#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
@Author : liuyuqi.gov@msn.cn
@Time   : 2018/4/11 1:07
@File   : main.py
'''
import os
import random
import re
import smtplib
import sys
import time
import traceback
from email.header import Header
from email.mime.text import MIMEText
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from wordpress_xmlrpc import Client, WordPressPost
from wordpress_xmlrpc.compat import xmlrpc_client
from wordpress_xmlrpc.methods import media, posts
from wordpress_xmlrpc.methods.posts import GetPosts, NewPost
from wordpress_xmlrpc.methods.users import GetUserInfo

from conf import *
from myutils import *
  25. user_agents=load_user_agent()
  26. #新闻类
  27. class News(object):
  28. def __init__(self,title,tags,category,content,image_name):
  29. self.title = title #标题
  30. self.tags=tags #标签
  31. self.category=category #分类
  32. self.content=content #内容
  33. self.image_name=image_name
  34. #设置请求头
  35. def setHeader(url):
  36. #抽取URL中的主机名
  37. host=getHost(url)
  38. length = len(user_agents)
  39. index=random.randint(0,length-1)
  40. user_agent = user_agents[index]
  41. headers={
  42. 'Referer': url,
  43. 'Host':host,
  44. 'User-Agent':user_agent,
  45. 'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
  46. }
  47. return headers
  48. #获取最新的新闻链接列表
  49. '''
  50. url :需要抓取的网址
  51. n :获取链接的数量,即每次需要发布新文章的数量
  52. links:返回链接列表
  53. '''
  54. def get_urls(url,classname,restr,n=1):
  55. links=[]
  56. headers=setHeader(url)
  57. bsObj=requests.session()
  58. bsObj=BeautifulSoup(bsObj.get(url,headers=headers).content,'html.parser')
  59. #print(bsObj.find('div',{'class':classname}))
  60. for link in bsObj.find('div',{'class':classname}).findAll('a')[0:n]:
  61. if 'href' in link.attrs:
  62. href=link.attrs['href']
  63. #print(href)
  64. if href.startswith('//'):
  65. href='http:'+href
  66. elif href.startswith('/'):
  67. href=url+href
  68. if re.match(restr,href):
  69. links.append(href)
  70. return links
  71. def get_news(url,link,classname):
  72. headers=setHeader(url)
  73. bsObj=requests.session()
  74. art=bsObj.get(link,headers=headers)
  75. #print(art.status_code)
  76. bsObj=BeautifulSoup(art.content,'html.parser')
  77. tit=bsObj.h1
  78. if tit!=None:
  79. title=tit.get_text()
  80. else:
  81. title=bsObj.title.get_text()
  82. print(title)
  83. tags_list=bsObj.find('meta',{'name':'keywords'}).attrs['content']
  84. #print(tags_list)
  85. l=re.split(',',tags_list)
  86. tags=[item for item in filter(lambda x:x != '', l)]
  87. category="其他"
  88. content=bsObj.find('div',{'class':classname})
  89. #查找图片
  90. a_tag=content.find('img')
  91. if a_tag!=None and a_tag.attrs['src']!='':
  92. image_url=a_tag.attrs['src']
  93. image_name=os.path.basename(image_url).split('!')[0]
  94. print(image_url)
  95. #下载图片
  96. get_image(image_url,image_name)
  97. #删除标签
  98. a_tag.extract()
  99. else:
  100. image_name=''
  101. news=News(title,tags,category,content.prettify(),image_name)
  102. return news
  103. #发送新闻到wordpress
  104. '''
  105. user:用户对象
  106. news:新闻对象
  107. '''
  108. def send_news(user,news):
  109. wp=Client(user.website,user.username,user.password)
  110. post=WordPressPost()
  111. if news.image_name!='':
  112. attachment_id=upload_image(news.image_name,wp)
  113. post.thumbnail = attachment_id
  114. post.title=news.title
  115. post.content=news.content
  116. post.post_status ='publish'
  117. post.terms_names={
  118. 'post_tag':news.tags,
  119. 'category':[news.category]
  120. }
  121. wp.call(NewPost(post))
  122. user=readUserConf()
  123. l=getConf()
  124. if len(l)==0:
  125. print('对不起,还没有配置抓取的站点信息')
  126. sys.exit()
  127. for conf in l:
  128. url=conf.url
  129. classname=conf.urltag
  130. newstag=conf.newstag
  131. restr=conf.restr
  132. ulist=get_urls(url,classname,restr,1)
  133. for ul in ulist:
  134. print(ul)
  135. try:
  136. news=get_news(url,ul,newstag)
  137. write_file(news.title+'\n')
  138. send_news(user,news)
  139. time.sleep(5)
  140. except Exception as e:
  141. m=traceback.format_exc()
  142. print(m)
  143. send_email(m)
  144. print('抓取失败:'+ul)
  145. continue