1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950 |
import codecs
import os
import re
import sys

import chardet
def WriteFile(filePath, u, encoding="utf-8"):
    """Write the text ``u`` to ``filePath`` encoded with ``encoding``.

    The file is opened in ``"w"`` mode, so existing content is replaced.
    The destination path is echoed to stdout before the text is written.
    """
    sink = codecs.open(filePath, "w", encoding)
    try:
        print(filePath)
        sink.write(u)
    finally:
        # Mirror the original ``with`` block: always release the handle.
        sink.close()
# An ``href`` attribute whose value is a ``javascript:if(confirm(...`` call,
# matched from ``href="`` up to the attribute's closing double quote.
_CONFIRM_HREF_RE = re.compile(r'href="javascript:if\(confirm\([^"]*"')
# The first absolute http(s)/ftp/file URL found inside such an attribute.
_EMBEDDED_URL_RE = re.compile(
    r'(https?|ftp|file)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]'
)


def _restore_confirm_links(data):
    """Replace each confirm-style ``href`` in ``data`` with the URL it embeds."""

    def _plain_href(match):
        url = _EMBEDDED_URL_RE.search(match.group())
        if url is None:
            # No URL inside this attribute: leave it untouched instead of
            # aborting the whole file.
            return match.group()
        return 'href="' + url.group() + '"'

    return _CONFIRM_HREF_RE.sub(_plain_href, data)


def convert(src, dst):
    """Rewrite confirm-style links in ``src`` and save the result to ``dst``.

    The source encoding is guessed with ``chardet``. Every
    ``href="javascript:if(confirm(...)"`` attribute is replaced by a plain
    ``href="<url>"`` pointing at the first URL found inside it, and the text
    is written to ``dst`` as UTF-8 through ``WriteFile``.

    The function is best-effort: a file whose encoding cannot be detected,
    or that cannot be decoded with the detected encoding, is reported on
    stdout and skipped. ``src`` is fully read and closed before ``dst`` is
    opened, so both may name the same file.

    NOTE(review): the output is written once per file even when no link
    matched (i.e. the file is still re-encoded to UTF-8); the original
    indentation was lost, so confirm this is the intended placement.

    Args:
        src: Path of the file to read.
        dst: Path of the file to write.
    """
    with open(src, "rb") as f:
        raw = f.read()

    # chardet reports ``None`` when it cannot make a guess (e.g. empty file).
    coding = chardet.detect(raw)["encoding"]
    if coding is None:
        print(src + " encoding not detected, skipped")
        return

    try:
        data = raw.decode(coding)
    except (UnicodeDecodeError, LookupError):
        # LookupError: chardet named a codec this Python does not provide.
        print(src + " " + coding + " read error")
        return

    try:
        WriteFile(dst, _restore_confirm_links(data), encoding="utf-8")
    except OSError:
        # Keep batch runs going when a single destination is not writable.
        print(dst + " write error")
def fix(rootdir):
    """Run ``convert`` in place on every ``.html`` file below ``rootdir``.

    ``os.walk`` already descends into every subdirectory, so no explicit
    recursion is performed here. Each matching file is passed to ``convert``
    as both source and destination.

    Args:
        rootdir: Directory whose tree is scanned.
    """
    for parent, _dirnames, filenames in os.walk(rootdir):
        for filename in filenames:
            if filename.endswith(".html"):
                path = os.path.join(parent, filename)
                convert(path, path)
if __name__ == "__main__":
    # The directory to process may be given as the first command-line
    # argument; without one, the original hard-coded location is used.
    if len(sys.argv) > 1:
        src_path = sys.argv[1]
    else:
        src_path = "E:/share/linux/100/www.100.me"
    fix(src_path)
|