data_preview.py 3.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. '''
  4. @Author :liuyuqi.gov@msn.cn
  5. @Time :2018/7/5 0:56
  6. @File :data_preview.py
  7. '''
  8. # 后台做图,不需要GUI
  9. # %matplotlib inline
  10. import matplotlib
  11. matplotlib.use('Agg')
  12. # 数据预览
  13. import numpy as np,pandas as pd
  14. import matplotlib.pyplot as plt
  15. def for_df1():
  16. # 应用app表: 应用id/cpu占用量/内存占用/磁盘占用/P/M/PM等指标
  17. df1=pd.read_csv("../data/scheduling_preliminary_app_resources_20180606.csv", header=None,names=list(["appid", "cpu", "mem", "disk", "P", "M", "PM"]))
  18. print(df1.dtypes)
  19. # appid object
  20. # cpu object
  21. # mem object
  22. # disk int64
  23. # P int64
  24. # M int64
  25. # PM int64
  26. print(df1.shape)
  27. # (9338, 7)
  28. # [5 rows x 7 columns]
  29. # print(df1.head())
  30. # app_3 0.5 | 0.5 | 0.5 | 0.5 | 0.5 | 0.5 | 0.5 | 0.5 | 0.5 | 0.5 | 0.5 | 0....... 0 0
  31. tmp=df1["cpu"].str.split('|',expand=True).astype('float')
  32. # [5 rows x 98 columns]
  33. df1["cpu"]=tmp.T.mean().T #转置,求均值,再转置回来,这样求得一行的均值。
  34. tmp=df1["mem"].str.split('|',expand=True).astype('float')
  35. # [5 rows x 98 columns]
  36. df1["mem"]=tmp.T.mean().T #转置,求均值,再转置回来,这样求得一行的均值。
  37. print(df1.head())
  38. print("总共应用:",df1["appid"].unique().shape)
  39. def for_df2():
  40. # 主机表 :宿主机id/ cpu规格/mem规格/disk规格/P上限/M上限/PM上限
  41. df2=pd.read_csv("../data/scheduling_preliminary_machine_resources_20180606.csv", header=None,names=list(
  42. ["machineid", "cpu", "mem", "disk", "P", "M", "PM"]))
  43. # df2 = pd.DataFrame(pd.read_csv("../data/scheduling_preliminary_machine_resources_20180606.csv", header=None),columns=list(["machineid", "cpu", "mem", "disk", "P", "M", "PM"]))
  44. print(df2.dtypes)
  45. # machineid object
  46. # cpu int
  47. # mem int
  48. # disk int64
  49. # P int64
  50. # M int64
  51. # PM int64
  52. print(df2.shape)
  53. # (6000, 7)
  54. print(df2.head())
  55. # machine_3 32 64 600 7 3 7
  56. print("总共主机:",df2["machineid"].unique().shape)
  57. # 6000
  58. # 这里主机主要就两类:
  59. # machine_1 32 64 600 7 3 7 数量:3000
  60. # machine_2 92 288 1024 7 7 9 数量:3000
  61. def for_df3():
  62. # 主机machine/实例instance/应用app 关系表
  63. df3=pd.read_csv("../data/scheduling_preliminary_instance_deploy_20180606.csv", header=None,names=list(["instanceid", "appid", "machineid"]))
  64. print(df3.dtypes)
  65. print("df数据大小:",df3.shape)
  66. print("instance唯一数量:",df3["instanceid"].unique().shape)
  67. # print(df2["instanceid"])
  68. print("总共实例:",df3["instanceid"].unique().shape)
  69. def for_df4():
  70. # 主机和实例表。部署appid1的insterference最多可以部署n个appid2
  71. df=pd.read_csv("../data/scheduling_preliminary_app_interference_20180606.csv",header=None,names=list(["appid1","appid2","max_interference"]))
  72. # 查看数据类型
  73. # print(df.dtypes)
  74. print("df数据大小:",df.shape)
  75. # 查看头尾部数据
  76. # app_8361 app_2163 0
  77. # app_6585 app_8959 0
  78. # print(df.head())
  79. # print(df.tail())
  80. # 查看索引
  81. # print(df.index)
  82. # 查看所有列标
  83. # print(df.columns)
  84. # 查看所有数据
  85. # print(df.values)
  86. # 第一列
  87. # df[0].groupby()
  88. # 第二列
  89. # 第三列
  90. # 描述性统计
  91. print("数据预览:",df.describe())
  92. plt.plot(df["max_interference"])
  93. plt.savefig("../submit/fig1.png")
# Run the app-interference preview at module load. There is no __main__ guard,
# so this also executes when the module is imported. The other previews
# (for_df1, for_df2, for_df3) are defined above but not called here.
for_df4()