|
@@ -6,17 +6,32 @@
|
|
@File :data_preview.py
|
|
@File :data_preview.py
|
|
'''
|
|
'''
|
|
|
|
|
|
-# 后台做图,不需要GUI
|
|
|
|
-# %matplotlib inline
|
|
|
|
|
|
+# 后台做图,不需要GUI需要在头部第一行加入下面两行代码
|
|
|
|
+# %matplotlib inline jupyter中加入这一行
|
|
import matplotlib
|
|
import matplotlib
|
|
matplotlib.use('Agg')
|
|
matplotlib.use('Agg')
|
|
|
|
+
|
|
# 数据预览
|
|
# 数据预览
|
|
-import numpy as np,pandas as pd
|
|
|
|
|
|
+import pandas as pd
|
|
import matplotlib.pyplot as plt
|
|
import matplotlib.pyplot as plt
|
|
|
|
+from configparser import ConfigParser
|
|
|
|
+
|
|
|
|
# Step 1: initialise the data-file parameters.

cf = ConfigParser()
config_path = "../conf/config.ini"
section_name = "data_file_name"

# ConfigParser.read() silently skips files it cannot open and returns the
# list of files it actually parsed, so an empty result means the config was
# not loaded. Fail here with the offending path instead of letting the
# cf.get() calls below raise a less helpful NoSectionError.
if not cf.read(config_path):
    raise FileNotFoundError(
        "config file not found or unreadable: {!r} "
        "(the path is resolved against the current working directory)".format(
            config_path
        )
    )

# Input file paths, looked up in the [data_file_name] section; the preview
# functions in this file pass them to pd.read_csv.
app_interference = cf.get(section_name, "app_interference")
app_resources = cf.get(section_name, "app_resources")
instance_deploy = cf.get(section_name, "instance_deploy")
machine_resources = cf.get(section_name, "machine_resources")
|
|
|
|
|
|
def for_df1():
|
|
def for_df1():
|
|
# 应用app表: 应用id/cpu占用量/内存占用/磁盘占用/P/M/PM等指标
|
|
# 应用app表: 应用id/cpu占用量/内存占用/磁盘占用/P/M/PM等指标
|
|
- df1=pd.read_csv("../data/scheduling_preliminary_app_resources_20180606.csv", header=None,names=list(["appid", "cpu", "mem", "disk", "P", "M", "PM"]))
|
|
|
|
|
|
+ df1 = pd.read_csv(app_resources, header=None,
|
|
|
|
+ names=list(["appid", "cpu", "mem", "disk", "P", "M", "PM"]))
|
|
print(df1.dtypes)
|
|
print(df1.dtypes)
|
|
# appid object
|
|
# appid object
|
|
# cpu object
|
|
# cpu object
|
|
@@ -30,19 +45,20 @@ def for_df1():
|
|
# [5 rows x 7 columns]
|
|
# [5 rows x 7 columns]
|
|
# print(df1.head())
|
|
# print(df1.head())
|
|
# app_3 0.5 | 0.5 | 0.5 | 0.5 | 0.5 | 0.5 | 0.5 | 0.5 | 0.5 | 0.5 | 0.5 | 0....... 0 0
|
|
# app_3 0.5 | 0.5 | 0.5 | 0.5 | 0.5 | 0.5 | 0.5 | 0.5 | 0.5 | 0.5 | 0.5 | 0....... 0 0
|
|
- tmp=df1["cpu"].str.split('|',expand=True).astype('float')
|
|
|
|
|
|
+ tmp = df1["cpu"].str.split('|', expand=True).astype('float')
|
|
# [5 rows x 98 columns]
|
|
# [5 rows x 98 columns]
|
|
- df1["cpu"]=tmp.T.mean().T #转置,求均值,再转置回来,这样求得一行的均值。
|
|
|
|
|
|
+ df1["cpu"] = tmp.T.mean().T # 转置,求均值,再转置回来,这样求得一行的均值。
|
|
|
|
|
|
- tmp=df1["mem"].str.split('|',expand=True).astype('float')
|
|
|
|
|
|
+ tmp = df1["mem"].str.split('|', expand=True).astype('float')
|
|
# [5 rows x 98 columns]
|
|
# [5 rows x 98 columns]
|
|
- df1["mem"]=tmp.T.mean().T #转置,求均值,再转置回来,这样求得一行的均值。
|
|
|
|
|
|
+ df1["mem"] = tmp.T.mean().T # 转置,求均值,再转置回来,这样求得一行的均值。
|
|
print(df1.head())
|
|
print(df1.head())
|
|
- print("总共应用:",df1["appid"].unique().shape)
|
|
|
|
|
|
+ print("总共应用:", df1["appid"].unique().shape)
|
|
|
|
+
|
|
|
|
|
|
def for_df2():
|
|
def for_df2():
|
|
# 主机表 :宿主机id/ cpu规格/mem规格/disk规格/P上限/M上限/PM上限
|
|
# 主机表 :宿主机id/ cpu规格/mem规格/disk规格/P上限/M上限/PM上限
|
|
- df2=pd.read_csv("../data/scheduling_preliminary_machine_resources_20180606.csv", header=None,names=list(
|
|
|
|
|
|
+ df2 = pd.read_csv(machine_resources, header=None, names=list(
|
|
["machineid", "cpu", "mem", "disk", "P", "M", "PM"]))
|
|
["machineid", "cpu", "mem", "disk", "P", "M", "PM"]))
|
|
# df2 = pd.DataFrame(pd.read_csv("../data/scheduling_preliminary_machine_resources_20180606.csv", header=None),columns=list(["machineid", "cpu", "mem", "disk", "P", "M", "PM"]))
|
|
# df2 = pd.DataFrame(pd.read_csv("../data/scheduling_preliminary_machine_resources_20180606.csv", header=None),columns=list(["machineid", "cpu", "mem", "disk", "P", "M", "PM"]))
|
|
print(df2.dtypes)
|
|
print(df2.dtypes)
|
|
@@ -57,30 +73,32 @@ def for_df2():
|
|
# (6000, 7)
|
|
# (6000, 7)
|
|
print(df2.head())
|
|
print(df2.head())
|
|
# machine_3 32 64 600 7 3 7
|
|
# machine_3 32 64 600 7 3 7
|
|
- print("总共主机:",df2["machineid"].unique().shape)
|
|
|
|
|
|
+ print("总共主机:", df2["machineid"].unique().shape)
|
|
# 6000
|
|
# 6000
|
|
|
|
|
|
# 这里主机主要就两类:
|
|
# 这里主机主要就两类:
|
|
# machine_1 32 64 600 7 3 7 数量:3000
|
|
# machine_1 32 64 600 7 3 7 数量:3000
|
|
# machine_2 92 288 1024 7 7 9 数量:3000
|
|
# machine_2 92 288 1024 7 7 9 数量:3000
|
|
|
|
|
|
|
|
+
|
|
def for_df3():
    """Preview the machine / instance / app deployment relation table.

    Reads the CSV whose path is held in the module-level ``instance_deploy``
    setting (the file is read without a header row, so the column names are
    supplied here) and prints the column dtypes, the frame shape and the
    shape of the array of unique ``instanceid`` values.
    """
    column_names = ["instanceid", "appid", "machineid"]
    deploy_df = pd.read_csv(instance_deploy, header=None, names=column_names)

    print(deploy_df.dtypes)
    print("df数据大小:", deploy_df.shape)

    # The unique-instance shape is reported under two labels; compute it
    # once and reuse it for both lines of output.
    unique_instance_shape = deploy_df["instanceid"].unique().shape
    print("instance唯一数量:", unique_instance_shape)
    print("总共实例:", unique_instance_shape)
|
|
|
|
|
|
|
|
|
|
def for_df4():
|
|
def for_df4():
|
|
# 主机和实例表。部署appid1的insterference最多可以部署n个appid2
|
|
# 主机和实例表。部署appid1的insterference最多可以部署n个appid2
|
|
- df=pd.read_csv("../data/scheduling_preliminary_app_interference_20180606.csv",header=None,names=list(["appid1","appid2","max_interference"]))
|
|
|
|
|
|
+ df = pd.read_csv(app_interference, header=None,
|
|
|
|
+ names=list(["appid1", "appid2", "max_interference"]))
|
|
# 查看数据类型
|
|
# 查看数据类型
|
|
# print(df.dtypes)
|
|
# print(df.dtypes)
|
|
- print("df数据大小:",df.shape)
|
|
|
|
|
|
+ print("df数据大小:", df.shape)
|
|
|
|
|
|
# 查看头尾部数据
|
|
# 查看头尾部数据
|
|
# app_8361 app_2163 0
|
|
# app_8361 app_2163 0
|
|
@@ -103,9 +121,10 @@ def for_df4():
|
|
# 第三列
|
|
# 第三列
|
|
|
|
|
|
# 描述性统计
|
|
# 描述性统计
|
|
- print("数据预览:",df.describe())
|
|
|
|
|
|
+ print("数据预览:", df.describe())
|
|
|
|
|
|
plt.plot(df["max_interference"])
|
|
plt.plot(df["max_interference"])
|
|
plt.savefig("../submit/fig1.png")
|
|
plt.savefig("../submit/fig1.png")
|
|
|
|
|
|
-for_df4()
|
|
|
|
|
|
+
|
|
|
|
+for_df4()
|