"""Rewrite ``javascript:if(confirm(...`` hrefs in HTML files and re-save them as UTF-8.

For every ``*.html`` file under a root directory, each ``href`` attribute whose
value starts with ``javascript:if(confirm(`` is replaced by a plain ``href``
pointing at the first URL found inside that attribute.  The file is then
written back in place, encoded as UTF-8.
"""
import os
import codecs
import re

try:
    import chardet
except ImportError:
    # chardet is third-party.  Without it only files that are strictly valid
    # UTF-8 can be processed (see _detect_encoding); others are reported.
    chardet = None

# An href attribute holding a ``javascript:if(confirm(...`` snippet, up to and
# including the closing double quote.
_CONFIRM_HREF = re.compile(r'href="javascript:if\(confirm\([^"]*"')
# An http(s)/ftp/file URL; search() yields the first one inside a snippet.
_URL = re.compile(
    r'(https?|ftp|file)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]'
)


def WriteFile(filePath, u, encoding="utf-8"):
    """Write the text *u* to *filePath* using *encoding*, printing the path."""
    with codecs.open(filePath, "w", encoding) as f:
        print(filePath)
        f.write(u)


def _detect_encoding(raw):
    """Return a codec name for the bytes *raw*, or None if no safe guess exists."""
    if chardet is not None:
        guess = chardet.detect(raw)["encoding"]
        if guess:
            return guess
    # No detector verdict (chardet missing, or it returned None, as it does
    # for empty input).  Accept the data only if it is strictly valid UTF-8:
    # re-saving such a file as UTF-8 is lossless, whereas guessing any other
    # codec could corrupt a file that is overwritten in place.
    try:
        raw.decode("utf-8")
    except UnicodeDecodeError:
        return None
    return "utf-8"


def _rewrite_links(data):
    """Return *data* with every confirm-style href replaced by a direct href."""

    def _direct_href(match):
        url = _URL.search(match.group(0))
        if url is None:
            # No URL to point at: keep this link untouched instead of
            # abandoning every other fix in the same document.
            return match.group(0)
        return 'href="' + url.group(0) + '"'

    return _CONFIRM_HREF.sub(_direct_href, data)


def convert(src, dst):
    """Read *src*, rewrite its confirm-style links and save it to *dst* as UTF-8.

    The source encoding is detected from the file's bytes.  A file whose
    encoding cannot be determined or decoded is reported on stdout and left
    untouched; a failure while writing *dst* is reported the same way, so a
    directory walk can carry on with the remaining files.  Errors opening or
    reading *src* propagate to the caller.
    """
    with open(src, "rb") as f:
        raw = f.read()

    coding = _detect_encoding(raw)
    read_error = "{} {} read error".format(src, coding)
    if coding is None:
        print(read_error)
        return
    try:
        data = raw.decode(coding)
    except (LookupError, UnicodeDecodeError):
        # LookupError: the detector named a codec this Python does not know.
        print(read_error)
        return

    try:
        WriteFile(dst, _rewrite_links(data), encoding="utf-8")
    except OSError as err:
        print("{} write error: {}".format(dst, err))


def fix(rootdir):
    """Convert, in place, every ``*.html`` file found anywhere under *rootdir*."""
    # os.walk already descends into every subdirectory, so no explicit
    # recursion is needed.  Recursing on the bare directory name would resolve
    # it against the current working directory rather than *rootdir*.
    for parent, _dirnames, filenames in os.walk(rootdir):
        for filename in filenames:
            if filename.endswith(".html"):
                path = os.path.join(parent, filename)
                convert(path, path)


if __name__ == "__main__":
    src_path = "E:/share/linux/100/www.100.me"
    fix(src_path)