Browse Source

添加数据初步探索 data_preview.py
sas描述分析文件 desc.sas
注意:.sas文件编码GB2312 !!

liuyuqi-dellpc 6 years ago
parent
commit
fe9706ead0
3 changed files with 141 additions and 0 deletions
  1. 102 0
      code/data_preview.py
  2. 16 0
      code/desc.sas
  3. 23 0
      code/test_pandas.py

+ 102 - 0
code/data_preview.py

@@ -0,0 +1,102 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+'''
+@Auther :liuyuqi.gov@msn.cn
+@Time :2018/7/5 0:56
+@File :data_preview.py
+'''
+
+# 后台做图,不需要GUI
+# %matplotlib inline
+import matplotlib
+matplotlib.use('Agg')
+# 数据预览
+import numpy as np,pandas as pd
+import matplotlib.pyplot as plt
+
+def for_df1():
+    # 应用app表: 应用id/cpu占用量/内存占用/磁盘占用/P/M/PM等指标
+    df1=pd.read_csv("../data/scheduling_preliminary_app_resources_20180606.csv", header=None,names=list(["appid", "cpu", "mem", "disk", "P", "M", "PM"]))
+    print(df1.dtypes)
+    # appid    object
+    # cpu      object
+    # mem      object
+    # disk      int64
+    # P         int64
+    # M         int64
+    # PM        int64
+    print(df1.shape)
+    # (9338, 7)
+    # [5 rows x 7 columns]
+    # print(df1.head())
+    # app_3 0.5 | 0.5 | 0.5 | 0.5 | 0.5 | 0.5 | 0.5 | 0.5 | 0.5 | 0.5 | 0.5 | 0....... 0 0
+    tmp=df1["cpu"].str.split('|',expand=True).astype('float')
+    # [5 rows x 98 columns]
+    df1["cpu"]=tmp.T.mean().T #转置,求均值,再转置回来,这样求得一行的均值。
+
+    tmp=df1["mem"].str.split('|',expand=True).astype('float')
+    # [5 rows x 98 columns]
+    df1["mem"]=tmp.T.mean().T #转置,求均值,再转置回来,这样求得一行的均值。
+    print(df1.head())
+
+def for_df2():
+    # 主机表 :宿主机id/ cpu规格/mem规格/disk规格/P上限/M上限/PM上限
+    df2=pd.read_csv("../data/scheduling_preliminary_machine_resources_20180606.csv", header=None,names=list(
+        ["machineid", "cpu", "mem", "disk", "P", "M", "PM"]))
+    # df2 = pd.DataFrame(pd.read_csv("../data/scheduling_preliminary_machine_resources_20180606.csv", header=None),columns=list(["machineid", "cpu", "mem", "disk", "P", "M", "PM"]))
+    print(df2.dtypes)
+    # machineid    object
+    # cpu      int
+    # mem      int
+    # disk      int64
+    # P         int64
+    # M         int64
+    # PM        int64
+    print(df2.shape)
+    # (6000, 7)
+    print(df2.head())
+    # machine_3   32   64   600  7  3   7
+
+def for_df3():
+    # 主机machine/实例instance/应用app 关系表
+    df2=pd.read_csv("../data/scheduling_preliminary_instance_deploy_20180606.csv", header=None,names=list(["instanceid", "appid", "machineid"]))
+    print(df2.dtypes)
+    print("df数据大小:",df2.shape)
+    print("instance唯一数量:",df2["instanceid"].unique().shape)
+    # print(df2["instanceid"])
+
+
+
+def for_df4():
+    # 主机和实例表。部署appid1的insterference最多可以部署n个appid2
+    df=pd.DataFrame(pd.read_csv("../data/scheduling_preliminary_app_interference_20180606.csv",header=None),columns=list(["appid1","appid2","max_interference"]))
+    # 查看数据类型
+    # print(df.dtypes)
+    print("df数据大小:",df.shape)
+
+    # 查看头尾部数据
+    # app_8361  app_2163  0
+    # app_6585  app_8959  0
+    # print(df.head())
+    # print(df.tail())
+
+    # 查看索引
+    # print(df.index)
+    # 查看所有列标
+    # print(df.columns)
+    # 查看所有数据
+    # print(df.values)
+
+    # 第一列
+    # df[0].groupby()
+
+    # 第二列
+
+    # 第三列
+
+    # 描述性统计
+    print("数据预览:",df.describe())
+
+    plt.plot(df["max_machine"])
+    plt.savefig("../submit/fig1.png")
+

+ 16 - 0
code/desc.sas

@@ -0,0 +1,16 @@
+
+PROC IMPORT OUT= WORK.test1 
+            DATAFILE= "D:\liuyuqi\数学建模项目\2018\阿里巴巴全球调度算法
+大赛\ServerManager\data\scheduling_preliminary_app_interference_20180606
+.csv" 
+     DBMS=CSV REPLACE;
+     GETNAMES=YES;
+     DATAROW=2; 
+RUN;
+
+
+
+proc univariate data=WORK.test1;
+	var x1 x2 x3;
+run;
+

+ 23 - 0
code/test_pandas.py

@@ -0,0 +1,23 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+'''
+@Auther :liuyuqi.gov@msn.cn
+@Time :2018/7/5 3:08
+@File :test_pandas.py
+'''
+import pandas as pd ,numpy as np
+
+
+def t1():
+    a = [['a', '1.2', '4.2'], ['b', '70', '0.03'], ['x', '5', '0']]
+    df = pd.DataFrame(a, columns=list("ABC"))
+    print(df.dtypes)
+    print(df)
+
+def t2():
+    obj = pd.Series(list('cadaabbcc'))
+    uniques = obj.unique()
+    print(obj.dtypes)
+    print(uniques.shape)
+
+t2()