#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@Contact :   liuyuqi.gov@msn.cn
@Time    :   2017-06-26
@License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
@Desc    :   Merge the records found under the <job> element of every XML
             file in a directory into one de-duplicated CSV file.
'''
import datetime
import os

import unicodecsv as csv
from bs4 import BeautifulSoup
import pandas as pd


class GooExport(object):
    """Collect records from a directory of XML files and export them as CSV."""

    def __init__(self, inputDataPath: str, outResultFile: str) -> None:
        """Remember where to read from and where to write to.

        Args:
            inputDataPath: Directory that is scanned (non-recursively) for
                files whose name ends with ``.xml``.
            outResultFile: Path of the CSV file to create. A relative path is
                resolved against ``inputDataPath``.
        """
        self.inputDataPath = inputDataPath
        self.outResultFile = outResultFile
        # One row per parsed record; the columns are the union of all field
        # names seen so far, so individual cells may be missing (NaN).
        self.job = pd.DataFrame()

    def parseXml(self, dataFile: str) -> None:
        """Append the records of one XML file to ``self.job``.

        Each element directly below the first ``<job>`` element is treated as
        one record and each element directly below a record as one field: the
        tag name becomes the column name and the field's first child node
        becomes the value. Records may carry different sets of fields.
        Exact duplicate rows are dropped afterwards.

        Args:
            dataFile: Path of the XML file to read (decoded as UTF-8,
                undecodable bytes are replaced).
        """
        with open(dataFile, 'r', encoding='utf8', errors='replace') as f:
            xml_doc = f.read()

        # Remove every newline and space before parsing, as the original
        # implementation did.
        # NOTE(review): this also deletes spaces inside field values and
        # between a tag name and its attributes; it is kept so that exported
        # values stay unchanged - confirm whether that is still wanted.
        xml_doc = xml_doc.replace("\n", "").replace(" ", "")

        soup = BeautifulSoup(xml_doc, "lxml-xml")
        job_node = soup.find("job")
        if job_node is None:
            # Nothing to import from this file; do not abort the whole run.
            print("no <job> element found in %s, skipped" % dataFile)
            return

        rows = []
        # find_all(True, recursive=False) yields direct child *elements* only,
        # so stray text nodes or comments cannot break the traversal.
        for record in job_node.find_all(True, recursive=False):
            row = {
                field.name: field.contents[0]
                for field in record.find_all(True, recursive=False)
                if field.contents  # an empty element has no value to store
            }
            if row:
                rows.append(row)

        if rows:
            # Build the frame once per file instead of once per record.
            self.job = pd.concat(
                [self.job, pd.DataFrame(rows)], ignore_index=True)
        self.removeDuplication()

    def removeDuplication(self) -> None:
        """Drop rows of ``self.job`` that are exact duplicates of an earlier row."""
        self.job = self.job.drop_duplicates()

    def run(self) -> None:
        """Parse every ``*.xml`` file in the input directory and write the CSV.

        The CSV is written with the Excel dialect and GB18030 encoding;
        characters that cannot be encoded are dropped. The header row holds
        the column names of ``self.job``.
        """
        print("start!")
        # Resolve the directory once so that relative paths keep working;
        # the process working directory is left untouched.
        input_dir = os.path.abspath(self.inputDataPath)
        # join() returns outResultFile unchanged when it is already absolute.
        out_path = os.path.join(input_dir, self.outResultFile)

        # Sorted so that the row order of the export is reproducible.
        for name in sorted(os.listdir(input_dir)):
            path = os.path.join(input_dir, name)
            if name.endswith(".xml") and os.path.isfile(path):
                self.parseXml(path)

        with open(out_path, 'wb') as f:
            writer = csv.writer(f, dialect='excel', encoding='gb18030',
                                errors="ignore")
            writer.writerow(self.job.columns)
            writer.writerows(self.job.values)
        print("finish!")


def main() -> None:
    """Parse the command line and run one export."""
    from argparse import ArgumentParser

    parser = ArgumentParser()
    parser.add_argument("-i", "--inputDataPath", help="inputDataPath",
                        default=".", type=str)
    parser.add_argument("-o", "--outResultFile", help="outResultFile",
                        default="", type=str)
    args = parser.parse_args()

    inputDataPath = args.inputDataPath
    outResultFile = args.outResultFile

    if inputDataPath == "":
        print("please input inputDataPath and outResultFile")
        raise SystemExit

    if outResultFile == "":
        # Default: a time-stamped file inside the input directory.
        stamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
        outResultFile = os.path.join(
            os.path.abspath(inputDataPath), "result%s.csv" % stamp)

    gooExport = GooExport(inputDataPath, outResultFile)
    gooExport.run()


if __name__ == '__main__':
    main()