123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111 |
- #!/usr/bin/env python
- # -*- coding: utf-8 -*-
- '''
- @Auther :liuyuqi.gov@msn.cn
- @Time :2018/7/5 0:56
- @File :data_preview.py
- '''
- # 后台做图,不需要GUI
- # %matplotlib inline
- import matplotlib
- matplotlib.use('Agg')
- # 数据预览
- import numpy as np,pandas as pd
- import matplotlib.pyplot as plt
- def for_df1():
- # 应用app表: 应用id/cpu占用量/内存占用/磁盘占用/P/M/PM等指标
- df1=pd.read_csv("../data/scheduling_preliminary_app_resources_20180606.csv", header=None,names=list(["appid", "cpu", "mem", "disk", "P", "M", "PM"]))
- print(df1.dtypes)
- # appid object
- # cpu object
- # mem object
- # disk int64
- # P int64
- # M int64
- # PM int64
- print(df1.shape)
- # (9338, 7)
- # [5 rows x 7 columns]
- # print(df1.head())
- # app_3 0.5 | 0.5 | 0.5 | 0.5 | 0.5 | 0.5 | 0.5 | 0.5 | 0.5 | 0.5 | 0.5 | 0....... 0 0
- tmp=df1["cpu"].str.split('|',expand=True).astype('float')
- # [5 rows x 98 columns]
- df1["cpu"]=tmp.T.mean().T #转置,求均值,再转置回来,这样求得一行的均值。
- tmp=df1["mem"].str.split('|',expand=True).astype('float')
- # [5 rows x 98 columns]
- df1["mem"]=tmp.T.mean().T #转置,求均值,再转置回来,这样求得一行的均值。
- print(df1.head())
- print("总共应用:",df1["appid"].unique().shape)
- def for_df2():
- # 主机表 :宿主机id/ cpu规格/mem规格/disk规格/P上限/M上限/PM上限
- df2=pd.read_csv("../data/scheduling_preliminary_machine_resources_20180606.csv", header=None,names=list(
- ["machineid", "cpu", "mem", "disk", "P", "M", "PM"]))
- # df2 = pd.DataFrame(pd.read_csv("../data/scheduling_preliminary_machine_resources_20180606.csv", header=None),columns=list(["machineid", "cpu", "mem", "disk", "P", "M", "PM"]))
- print(df2.dtypes)
- # machineid object
- # cpu int
- # mem int
- # disk int64
- # P int64
- # M int64
- # PM int64
- print(df2.shape)
- # (6000, 7)
- print(df2.head())
- # machine_3 32 64 600 7 3 7
- print("总共主机:",df2["machineid"].unique().shape)
- # 6000
- # 这里主机主要就两类:
- # machine_1 32 64 600 7 3 7 数量:3000
- # machine_2 92 288 1024 7 7 9 数量:3000
- def for_df3():
- # 主机machine/实例instance/应用app 关系表
- df3=pd.read_csv("../data/scheduling_preliminary_instance_deploy_20180606.csv", header=None,names=list(["instanceid", "appid", "machineid"]))
- print(df3.dtypes)
- print("df数据大小:",df3.shape)
- print("instance唯一数量:",df3["instanceid"].unique().shape)
- # print(df2["instanceid"])
- print("总共实例:",df3["instanceid"].unique().shape)
- def for_df4():
- # 主机和实例表。部署appid1的insterference最多可以部署n个appid2
- df=pd.read_csv("../data/scheduling_preliminary_app_interference_20180606.csv",header=None,names=list(["appid1","appid2","max_interference"]))
- # 查看数据类型
- # print(df.dtypes)
- print("df数据大小:",df.shape)
- # 查看头尾部数据
- # app_8361 app_2163 0
- # app_6585 app_8959 0
- # print(df.head())
- # print(df.tail())
- # 查看索引
- # print(df.index)
- # 查看所有列标
- # print(df.columns)
- # 查看所有数据
- # print(df.values)
- # 第一列
- # df[0].groupby()
- # 第二列
- # 第三列
- # 描述性统计
- print("数据预览:",df.describe())
- plt.plot(df["max_interference"])
- plt.savefig("../submit/fig1.png")
- for_df4()
|