data_predo.py 764 B

123456789101112131415161718192021
  1. #!/usr/bin/env python
  2. # -*- encoding: utf-8 -*-
  3. '''
  4. @Contact : liuyuqi.gov@msn.cn
  5. @Time : 2023/03/08 23:05:51
  6. @License : Copyright © 2017-2022 liuyuqi. All Rights Reserved.
  7. @Desc : Preprocess b.csv: take the first 500 popular English words that are shorter than 5 characters
  8. '''
  9. import pandas as pd
  10. if __name__=='__main__':
  11. with open("data/b.csv", "r", encoding="utf-8") as file:
  12. res=file.readlines()
  13. res=[x.strip().lower() for x in res]
  14. res = pd.Series(res).drop_duplicates()
  15. data = pd.DataFrame(res, columns=["name"])
  16. data['strlen'] = data['name'].str.len()
  17. # data.sort_values(by='strlen', inplace=True)
  18. data=data[data['strlen'] < 5 ]
  19. data["name"][:500].to_csv("res3.csv", index=False, header=None)