#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
@Author :liuyuqi.gov@msn.cn
@Time   :2018/4/11 1:07
@File   :main.py
'''
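
# Scrape the latest articles from the configured news sites, download each
# article's first image as the featured image, and publish the result to
# WordPress over XML-RPC; failures are reported by email via send_email().
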
from bs4 import BeautifulSoup
from wordpress_xmlrpc import Client, WordPressPost
from wordpress_xmlrpc.compat import xmlrpc_client
from wordpress_xmlrpc.methods import media, posts
from wordpress_xmlrpc.methods.posts import GetPosts, NewPost
from wordpress_xmlrpc.methods.users import GetUserInfo
from email.mime.text import MIMEText
from email.header import Header
import re
import time
import smtplib
import traceback
import os
import random
import requests
import sys
from conf import *
from myutils import *

user_agents = load_user_agent()


# A scraped news article
class News(object):
    def __init__(self, title, tags, category, content, image_name):
        self.title = title            # title
        self.tags = tags              # tags
        self.category = category      # category
        self.content = content        # body (HTML)
        self.image_name = image_name  # local file name of the featured image, '' if none


# Build the request headers for a scraping request
def setHeader(url):
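    '''Return HTTP headers for the given URL with a randomly chosen User-Agent.

    user_agents is loaded once at import time via load_user_agent(); rotating
    the User-Agent between requests makes the scraper less likely to be blocked.
    '''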
    # extract the host name from the URL
    host = getHost(url)
    user_agent = random.choice(user_agents)
    headers = {
        'Referer': url,
        'Host': host,
        'User-Agent': user_agent,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
    }
    return headers


# Fetch the latest news links from a site
'''
url      : the page to scrape
classname: class of the <div> that contains the link list
restr    : regular expression a link must match
n        : number of links to return, i.e. how many posts to publish per run
returns the list of matching links
'''
def get_urls(url, classname, restr, n=1):
    links = []
    headers = setHeader(url)
    session = requests.session()
    bsObj = BeautifulSoup(session.get(url, headers=headers).content, 'html.parser')
    # print(bsObj.find('div', {'class': classname}))
    for link in bsObj.find('div', {'class': classname}).findAll('a')[0:n]:
        if 'href' in link.attrs:
            href = link.attrs['href']
            # print(href)
            if href.startswith('//'):    # protocol-relative link
                href = 'http:' + href
            elif href.startswith('/'):   # site-relative link
                href = url + href
            if re.match(restr, href):
                links.append(href)
    return links


def get_news(url, link, classname):
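    '''Download one article and turn it into a News object.

    url      : the site root, used to build the request headers (Referer/Host)
    link     : URL of the article to fetch
    classname: class of the <div> that holds the article body
    '''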
    headers = setHeader(url)
    session = requests.session()
    art = session.get(link, headers=headers)
    # print(art.status_code)
    bsObj = BeautifulSoup(art.content, 'html.parser')
    # prefer the first <h1> as the title, fall back to the page <title>
    tit = bsObj.h1
    if tit is not None:
        title = tit.get_text()
    else:
        title = bsObj.title.get_text()
    print(title)
    # the keywords meta tag becomes the post tags
    tags_list = bsObj.find('meta', {'name': 'keywords'}).attrs['content']
    # print(tags_list)
    l = re.split(',', tags_list)
    tags = [item for item in filter(lambda x: x != '', l)]
    category = "其他"  # default category ("Other")
    content = bsObj.find('div', {'class': classname})
    # look for an image to use as the featured image
    a_tag = content.find('img')
    if a_tag is not None and a_tag.attrs.get('src'):
        image_url = a_tag.attrs['src']
        image_name = os.path.basename(image_url).split('!')[0]
        print(image_url)
        # download the image
        get_image(image_url, image_name)
        # remove the <img> tag from the article body
        a_tag.extract()
    else:
        image_name = ''
    news = News(title, tags, category, content.prettify(), image_name)
    return news


# Publish a news item to WordPress
'''
user: user object holding the site URL and credentials
news: the News object to publish
'''
def send_news(user, news):
    wp = Client(user.website, user.username, user.password)
    post = WordPressPost()
    if news.image_name != '':
        # upload the downloaded image and use it as the featured image
        attachment_id = upload_image(news.image_name, wp)
        post.thumbnail = attachment_id
    post.title = news.title
    post.content = news.content
    post.post_status = 'publish'
    post.terms_names = {
        'post_tag': news.tags,
        'category': [news.category]
    }
    wp.call(NewPost(post))
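

# Main flow: read the WordPress credentials and the list of configured sites,
# then scrape each site and publish the new articles one by one.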
user = readUserConf()
l = getConf()
if len(l) == 0:
    print('Sorry, no site has been configured for scraping yet')
    sys.exit()
for conf in l:
    url = conf.url
    classname = conf.urltag
    newstag = conf.newstag
    restr = conf.restr
    ulist = get_urls(url, classname, restr, 1)
    for ul in ulist:
        print(ul)
        try:
            news = get_news(url, ul, newstag)
            write_file(news.title + '\n')
            send_news(user, news)
            # pause between posts so the target site and WordPress are not hammered
            time.sleep(5)
        except Exception as e:
            m = traceback.format_exc()
            print(m)
            # mail the traceback and move on to the next link
            send_email(m)
            print('Failed to scrape: ' + ul)
            continue