12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182 |
- #!/usr/bin/env python
- # -*- encoding: utf-8 -*-
- '''
- @Contact : liuyuqi.gov@msn.cn
- @Time : 2017-06-26
- @License : Copyright © 2017-2022 liuyuqi. All Rights Reserved.
- @Desc : Merge the <job> records of every XML file in a directory into one de-duplicated CSV file.
- '''
- import os,datetime
- import unicodecsv as csv
- from bs4 import BeautifulSoup
- import pandas as pd
class GooExport(object):
    """Merge the ``<job>`` records of a directory of XML files into one CSV.

    Typical use is ``GooExport(input_dir, out_csv).run()``.
    """

    def __init__(self, inputDataPath, outResultFile):
        """Store the paths and start with an empty record table.

        Args:
            inputDataPath: Directory scanned (non-recursively) for ``*.xml``.
            outResultFile: Path of the CSV file written by ``run``.
        """
        self.inputDataPath = inputDataPath
        self.outResultFile = outResultFile
        # One row per record, one column per field name seen so far.
        self.job = pd.DataFrame()

    def parseXml(self, dataFile):
        """Parse one XML file and append its records to ``self.job``.

        The first ``<job>`` element is located; each of its child elements
        is one record and each child element of a record is one field
        (column name = tag name, value = the field's first child node).
        Fields may be missing from a record; the table then holds NaN there.
        A file without a ``<job>`` element contributes nothing.

        Args:
            dataFile: Path of the XML file, read as UTF-8 with undecodable
                bytes replaced.
        """
        with open(dataFile, 'r', encoding='utf8', errors='replace') as f:
            xml_doc = f.read()

        # Remove all newlines and spaces before parsing (original behaviour,
        # kept so that exported values do not change).
        # NOTE(review): this also deletes spaces inside field values and
        # between XML attributes -- confirm that this is really intended.
        xml_doc = xml_doc.replace("\n", "").replace(" ", "")
        soup = BeautifulSoup(xml_doc, "lxml-xml")

        job_node = soup.find("job")
        if job_node is None:
            # No <job> element: skip the file instead of failing with
            # AttributeError on None.
            return

        rows = []
        # find_all(True, recursive=False) yields only direct child *tags*,
        # so stray text or comment nodes can no longer abort the import.
        for record in job_node.find_all(True, recursive=False):
            row = {}
            for field in record.find_all(True, recursive=False):
                try:
                    # str() detaches the value from the parse tree so the
                    # table does not keep every parsed document alive.
                    row[field.name] = str(field.contents[0])
                except IndexError as e:
                    # Empty element: there is no value, leave the field out.
                    print(e)
            rows.append(row)

        if rows:
            # Single concat per file instead of one per record.
            self.job = pd.concat(
                [self.job, pd.DataFrame(rows)], ignore_index=True)
        self.removeDuplication()

    def removeDuplication(self):
        """Drop rows of ``self.job`` that are exact duplicates."""
        self.job = self.job.drop_duplicates()

    def run(self):
        """Import every ``*.xml`` file of the input directory and write the CSV.

        Paths are used exactly as given, i.e. relative paths are resolved
        against the caller's working directory, which is left untouched.
        (Changing into the input directory first made every relative input
        directory other than ``.`` fail, because the same relative path was
        then resolved a second time from inside that directory.)

        The CSV uses the ``excel`` dialect and GB18030 encoding; the first
        row holds the column names.
        """
        # Local import under a private name: the module-level ``csv`` name
        # of this file is bound to ``unicodecsv``. On Python 3 the standard
        # writer on a text-mode file produces the same bytes.
        import csv as std_csv

        print("start!")
        # Sorted so that the row order does not depend on the file system.
        for name in sorted(os.listdir(self.inputDataPath)):
            path = os.path.join(self.inputDataPath, name)
            if os.path.isfile(path) and name.endswith(".xml"):
                self.parseXml(path)

        # newline='' lets the csv module emit its own "\r\n" terminators.
        with open(self.outResultFile, 'w', encoding='gb18030',
                  errors='ignore', newline='') as f:
            writer = std_csv.writer(f, dialect='excel')
            writer.writerow(self.job.columns)
            writer.writerows(self.job.values)
        print("finish!")
# Command-line entry point: export all XML files of a directory to one CSV.
if __name__ == '__main__':
    from argparse import ArgumentParser

    parser = ArgumentParser()
    parser.add_argument("-i", "--inputDataPath", help="inputDataPath", default=".", type=str)
    parser.add_argument("-o", "--outResultFile", help="outResultFile", default="", type=str)
    args = parser.parse_args()
    inputDataPath = args.inputDataPath
    outResultFile = args.outResultFile

    # Validate before deriving anything from the input path. An explicitly
    # empty -i is a usage error: parser.error prints the usage text and
    # exits with a non-zero status (the builtin exit() reported success and
    # is only available when the site module is loaded).
    if inputDataPath == "":
        parser.error("please input inputDataPath and outResultFile")

    # No output file given: write a time-stamped file into the input directory.
    if outResultFile == "":
        outResultFile = os.path.join(
            inputDataPath,
            "result%s.csv" % datetime.datetime.now().strftime("%Y%m%d%H%M%S"))

    gooExport = GooExport(inputDataPath, outResultFile)
    gooExport.run()
|