TitleSpider.py 1.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869
# -*- coding: utf-8 -*-
  2. # @Time : 2022/6/21 15:33
  3. # @Author : 刘正阳
  4. # @File : TitleSpider.py
  5. # @Software : PyCharm
  6. import os
  7. import requests
  8. import re
  9. from bs4 import BeautifulSoup
  10. findLink = re.compile(r'"part":"(.*?)","duratio')
  11. global fileName
  12. # 获取网页数据,传入参数:网址
  13. def FinData(url):
  14. dataList = []
  15. getUrl = requests.get(url=url)
  16. bsHtml = BeautifulSoup(getUrl.text, "html.parser")
  17. urlTitleList = bsHtml.get_text().title().split('\n', 1)
  18. urlTitle = urlTitleList[0][:-30].lstrip()
  19. # dataList.append(str(urlTitle))
  20. bsFinData = bsHtml.select('script')
  21. bsData = ''
  22. # 筛选列表数据
  23. for i in bsFinData:
  24. bsData = str(i)
  25. if 'window.__INITIAL_STATE__={' in bsData:
  26. break
  27. # 正则查找,返回列表
  28. reList = re.findall(findLink, bsData)
  29. dataList += reList
  30. return dataList, urlTitle
  31. def saveAsTxt(video_list, urlTitle, path):
  32. fileTitle = urlTitle + ".txt" # 合成.txt格式 文件名
  33. for s in fileTitle:
  34. cut = ['|', '\\', '/', ':', '?', '"', '<', '>']
  35. if s in cut:
  36. fileTitle = fileTitle.replace(s, ' ')
  37. fileTitle = os.path.join(path, fileTitle)
  38. # 去除标题中的Windows不兼容的的命名字
  39. nameFile = open(fileTitle, "w", encoding="utf-8") # 写入文件
  40. j = 0
  41. for i in video_list:
  42. j += 1
  43. nameFile.write(i + "\n")
  44. nameFile.close()
  45. return fileTitle
  46. def GetTxt(bid, path):
  47. global fileName
  48. urlPart = 'https://www.bilibili.com/video/'
  49. bv = bid
  50. url = urlPart + bv
  51. dataList, urlTile = FinData(url)
  52. fileName = saveAsTxt(dataList, urlTile, path)