data_preview.py 3.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. '''
  4. @Author :liuyuqi.gov@msn.cn
  5. @Time :2018/7/5 0:56
  6. @File :data_preview.py
  7. '''
  8. # 后台做图,不需要GUI
  9. # %matplotlib inline
  10. import matplotlib
  11. matplotlib.use('Agg')
  12. # 数据预览
  13. import numpy as np,pandas as pd
  14. import matplotlib.pyplot as plt
  15. def for_df1():
  16. # 应用app表: 应用id/cpu占用量/内存占用/磁盘占用/P/M/PM等指标
  17. df1=pd.read_csv("../data/scheduling_preliminary_app_resources_20180606.csv", header=None,names=list(["appid", "cpu", "mem", "disk", "P", "M", "PM"]))
  18. print(df1.dtypes)
  19. # appid object
  20. # cpu object
  21. # mem object
  22. # disk int64
  23. # P int64
  24. # M int64
  25. # PM int64
  26. print(df1.shape)
  27. # (9338, 7)
  28. # [5 rows x 7 columns]
  29. # print(df1.head())
  30. # app_3 0.5 | 0.5 | 0.5 | 0.5 | 0.5 | 0.5 | 0.5 | 0.5 | 0.5 | 0.5 | 0.5 | 0....... 0 0
  31. tmp=df1["cpu"].str.split('|',expand=True).astype('float')
  32. # [5 rows x 98 columns]
  33. df1["cpu"]=tmp.T.mean().T #转置,求均值,再转置回来,这样求得一行的均值。
  34. tmp=df1["mem"].str.split('|',expand=True).astype('float')
  35. # [5 rows x 98 columns]
  36. df1["mem"]=tmp.T.mean().T #转置,求均值,再转置回来,这样求得一行的均值。
  37. print(df1.head())
  38. def for_df2():
  39. # 主机表 :宿主机id/ cpu规格/mem规格/disk规格/P上限/M上限/PM上限
  40. df2=pd.read_csv("../data/scheduling_preliminary_machine_resources_20180606.csv", header=None,names=list(
  41. ["machineid", "cpu", "mem", "disk", "P", "M", "PM"]))
  42. # df2 = pd.DataFrame(pd.read_csv("../data/scheduling_preliminary_machine_resources_20180606.csv", header=None),columns=list(["machineid", "cpu", "mem", "disk", "P", "M", "PM"]))
  43. print(df2.dtypes)
  44. # machineid object
  45. # cpu int
  46. # mem int
  47. # disk int64
  48. # P int64
  49. # M int64
  50. # PM int64
  51. print(df2.shape)
  52. # (6000, 7)
  53. print(df2.head())
  54. # machine_3 32 64 600 7 3 7
  55. def for_df3():
  56. # 主机machine/实例instance/应用app 关系表
  57. df2=pd.read_csv("../data/scheduling_preliminary_instance_deploy_20180606.csv", header=None,names=list(["instanceid", "appid", "machineid"]))
  58. print(df2.dtypes)
  59. print("df数据大小:",df2.shape)
  60. print("instance唯一数量:",df2["instanceid"].unique().shape)
  61. # print(df2["instanceid"])
  62. def for_df4():
  63. # 主机和实例表。部署appid1的insterference最多可以部署n个appid2
  64. df=pd.DataFrame(pd.read_csv("../data/scheduling_preliminary_app_interference_20180606.csv",header=None),columns=list(["appid1","appid2","max_interference"]))
  65. # 查看数据类型
  66. # print(df.dtypes)
  67. print("df数据大小:",df.shape)
  68. # 查看头尾部数据
  69. # app_8361 app_2163 0
  70. # app_6585 app_8959 0
  71. # print(df.head())
  72. # print(df.tail())
  73. # 查看索引
  74. # print(df.index)
  75. # 查看所有列标
  76. # print(df.columns)
  77. # 查看所有数据
  78. # print(df.values)
  79. # 第一列
  80. # df[0].groupby()
  81. # 第二列
  82. # 第三列
  83. # 描述性统计
  84. print("数据预览:",df.describe())
  85. plt.plot(df["max_machine"])
  86. plt.savefig("../submit/fig1.png")