main.py 3.0 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182
  1. #!/usr/bin/env python
  2. # -*- encoding: utf-8 -*-
  3. '''
  4. @Contact : liuyuqi.gov@msn.cn
  5. @Time : 2017-06-26
  6. @License : Copyright © 2017-2022 liuyuqi. All Rights Reserved.
  7. @Desc : Merge the <job> records of every .xml file in a directory into one de-duplicated CSV file.
  8. '''
  9. import os,datetime
  10. import unicodecsv as csv
  11. from bs4 import BeautifulSoup
  12. import pandas as pd
  13. class GooExport(object):
  14. """ docstring for GooExport """
  15. def __init__(self, inputDataPath, outResultFile):
  16. self.inputDataPath=inputDataPath
  17. self.outResultFile=outResultFile
  18. self.job = pd.DataFrame()
  19. def parseXml(self, dataFile):
  20. # print(dataFile,"------------------")
  21. with open(dataFile, 'r',encoding='utf8', errors = 'replace') as f:
  22. xml_doc =f.read() #读取xml文本内容
  23. # 去除空格和换行
  24. xml_doc=xml_doc.replace("\n", "")
  25. xml_doc=xml_doc.replace(" ", "")
  26. # xml 形式读取
  27. soup = BeautifulSoup(xml_doc, "lxml-xml")
  28. needData=soup.findChild("job")
  29. for i in range(len(needData.contents)):
  30. data=dict()
  31. for j in range(len(needData.contents[i].contents)):
  32. try:
  33. key=needData.contents[i].contents[j].name
  34. value=needData.contents[i].contents[j].contents[0]
  35. data[key]=value
  36. except Exception as e:
  37. print(e)
  38. # 把data加入到job(dataframe)中,其中key是列名,value是数据,可能会缺失字段
  39. new_df = pd.DataFrame([data]) # Convert the data to a DataFrame
  40. self.job = pd.concat([self.job, new_df], ignore_index=True)
  41. self.removeDuplication()
  42. def removeDuplication(self):
  43. # print("removeDuplication")
  44. self.job=self.job.drop_duplicates()
  45. def run(self):
  46. print("start!")
  47. os.chdir(self.inputDataPath)
  48. for lists in os.listdir(self.inputDataPath):
  49. if os.path.isfile(os.path.join(self.inputDataPath, lists)) and lists.endswith(".xml"):
  50. self.parseXml(os.path.join(self.inputDataPath, lists))
  51. with open(self.outResultFile, 'wb') as f:
  52. writer = csv.writer(f, dialect='excel', encoding='gb18030',errors="ignore")
  53. writer.writerow(self.job.columns)
  54. writer.writerows(self.job.values)
  55. print("finish!")
  56. if __name__=='__main__':
  57. from argparse import ArgumentParser
  58. parser = ArgumentParser()
  59. parser.add_argument("-i", "--inputDataPath", help="inputDataPath", default=".", type=str)
  60. parser.add_argument("-o", "--outResultFile", help="outResultFile", default="", type=str)
  61. args = parser.parse_args()
  62. inputDataPath=args.inputDataPath
  63. outResultFile=args.outResultFile
  64. if outResultFile == "":
  65. outResultFile= os.path.join(inputDataPath, "result%s.csv" % datetime.datetime.now().strftime("%Y%m%d%H%M%S"))
  66. if inputDataPath=="" or outResultFile=="":
  67. print("please input inputDataPath and outResultFile")
  68. exit()
  69. gooExport = GooExport(inputDataPath, outResultFile)
  70. gooExport.run()