123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192 |
- #!/usr/bin/env python
- # -*- encoding: utf-8 -*-
- '''
- @Author : liuyuqi
- @Contact : liuyuqi.gov@msn.cn
- @Time : 2019/08/27 15:30:32
- @Version : 1.0
- @License : (C)Copyright 2019
- @Desc : 微博热搜爬虫,分析
- '''
## Packages used by this script
import datetime
import json
import os
import random
import time
from collections import Counter

import jieba
import matplotlib.pyplot as plt
import pandas as pd
import requests
from pyecharts import GraphicShapeOpts
from pyecharts import options as opts
from pyecharts.charts import Calendar, Pie
from pyecharts.charts import Page, WordCloud, TreeMap
from pyecharts.globals import SymbolType
from pyecharts.globals import ThemeType
# NOTE(review): scipy.misc.imread was removed in SciPy 1.2 — confirm whether
# this image-reading helper is still needed before running on a recent SciPy.
from scipy.misc import imread  # image-reading helper
# NOTE: this import rebinds the name WordCloud, shadowing the pyecharts chart
# of the same name imported above.
from wordcloud import WordCloud, ImageColorGenerator
## Every relative path read or written below resolves against this directory.
WORK_DIR = 'D:/爬虫/微博热搜'
os.chdir(WORK_DIR)
## Build the list of dates between two bounds
def getBetweenDay(begin_date, end_date, date_format="%Y/%m/%d"):
    """Return every calendar day from begin_date to end_date, inclusive.

    Args:
        begin_date: First day, as a string matching ``date_format``.
        end_date: Last day, as a string matching ``date_format``.
        date_format: ``strptime``/``strftime`` pattern used both to parse the
            bounds and to format the result. Defaults to ``"%Y/%m/%d"``.

    Returns:
        A list of date strings in ascending order. The list is empty when
        begin_date is later than end_date.

    Raises:
        ValueError: If either bound does not match ``date_format``.
    """
    first_day = datetime.datetime.strptime(begin_date, date_format)
    last_day = datetime.datetime.strptime(end_date, date_format)
    # A negative span yields a non-positive count, so range() is empty.
    day_count = (last_day - first_day).days + 1
    return [
        (first_day + datetime.timedelta(days=offset)).strftime(date_format)
        for offset in range(day_count)
    ]
## Word segmentation
def get_words_list(df):
    """Attach the jieba search-mode tokens of a row's title to that row.

    Args:
        df: A row-like object with a 'title' entry. The script passes each
            DataFrame row here through ``DataFrame.apply(..., axis=1)``.

    Returns:
        The same object, with a 'words_list' entry holding the tokens produced
        by ``jieba.cut_for_search`` as a list.
    """
    tokens = list(jieba.cut_for_search(df['title']))
    df['words_list'] = tokens
    return df
## Explode a list-valued column into one row per element
def dataframe_explode(dataframe, fieldname):
    """Return a copy of ``dataframe`` with one row per element of ``fieldname``.

    Each value in ``dataframe[fieldname]`` must be an iterable of hashable
    items (for example a list of words). Every other column is repeated for
    each element. A row whose iterable is empty is kept once, with a missing
    value in ``fieldname``.

    Args:
        dataframe: Source frame. It is not modified.
        fieldname: Name of the column holding the iterables.

    Returns:
        A new DataFrame with a fresh integer index. The remaining columns keep
        their original order and ``fieldname`` comes last. For an empty input
        an empty copy with that same column order is returned.
    """
    temp_fieldname = fieldname + '_made_tuple_'
    kept_columns = [column for column in dataframe.columns if column != fieldname]

    # Lists are unhashable; tuples can be de-duplicated and used as a merge key.
    tuple_keys = dataframe[fieldname].apply(tuple)
    unique_keys = tuple_keys.unique().tolist()
    if not unique_keys:
        # Nothing to explode, and pd.concat() raises on an empty list.
        return dataframe[kept_columns + [fieldname]].copy()

    # One small lookup frame per distinct tuple: key repeated, elements spread.
    lookup = pd.concat([
        pd.DataFrame({
            temp_fieldname: [values] * len(values),
            fieldname: list(values),
        })
        for values in unique_keys
    ])

    # Build the merge key on a copy so the caller's frame stays untouched.
    left = dataframe[kept_columns].copy()
    left[temp_fieldname] = tuple_keys
    exploded = left.merge(lookup, how='left', on=temp_fieldname)
    del exploded[temp_fieldname]
    return exploded
## Draw a word cloud
def draw_word_cloud(word):
    """Render a word cloud of the hot-search titles that contain ``word``.

    Reads the module-level ``resou_word`` frame, keeps the rows whose 'title'
    matches ``word``, takes the maximum 'searchCount' per title, scales it down
    by 1,000,000 and writes the chart to '<word>词云.html' in the current
    working directory.

    Args:
        word: Pattern looked up in the titles. It is passed to
            ``Series.str.contains`` unchanged, so regular-expression
            metacharacters keep their special meaning.

    Returns:
        None.
    """
    # The module-level name WordCloud is rebound by
    # ``from wordcloud import WordCloud``; the chained API used below belongs
    # to the pyecharts chart, so import that one under an unambiguous name.
    from pyecharts.charts import WordCloud as EchartsWordCloud

    matching = resou_word[resou_word['title'].str.contains(word)]
    peak = matching.groupby(
        ['title'], as_index=False).agg({'searchCount': ['max']})
    peak.columns = ['title', 'count']
    # .tolist() hands pyecharts native Python values for JSON serialisation.
    data = [
        (title, count / 1000000)
        for title, count in zip(peak['title'].tolist(), peak['count'].tolist())
    ]
    (
        EchartsWordCloud(init_opts=opts.InitOpts(theme=ThemeType.ROMA))
        .add("", data, word_size_range=[20, 50], shape='pentagon')
        .set_global_opts(title_opts=opts.TitleOpts(title=''))
        .render('{}词云.html'.format(word))
    )
## Request headers and cookies used by the crawl
header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win32; x32; rv:54.0) Gecko/20100101 Firefox/54.0',
          'Connection': 'keep-alive'}
cookies = 'v=3; iuuid=1A6E888B4A4B29B16FBA1299108DBE9CDCB327A9713C232B36E4DB4FF222CF03; webp=true; ci=1%2C%E5%8C%97%E4%BA%AC; __guid=26581345.3954606544145667000.1530879049181.8303; _lxsdk_cuid=1646f808301c8-0a4e19f5421593-5d4e211f-100200-1646f808302c8; _lxsdk=1A6E888B4A4B29B16FBA1299108DBE9CDCB327A9713C232B36E4DB4FF222CF03; monitor_count=1; _lxsdk_s=16472ee89ec-de2-f91-ed0%7C%7C5; __mta=189118996.1530879050545.1530936763555.1530937843742.18'
# Turn the raw 'name=value; name=value' string into the dict requests expects.
cookie = {}
for line in cookies.split(';'):
    if '=' not in line:
        # Tolerate empty segments such as the one after a trailing ';'.
        continue
    # Split the current segment (not the whole string) on its first '=' only,
    # so values that themselves contain '=' stay intact.
    name, value = line.strip().split('=', 1)
    cookie[name] = value
# Download the daily hot-search ranking for every date in the range and
# collect it into `resou` (columns: date, title, searchCount, rank).
resou_date = getBetweenDay('2019/01/01', '2019/07/12')
rows = []
for day in resou_date:
    print(day)
    url = 'https://www.enlightent.cn/research/top/getWeiboHotSearchDayAggs.do?date={}'.format(
        str(day))
    # A timeout keeps one stalled connection from hanging the whole crawl.
    html = requests.get(url=url, cookies=cookie, headers=header,
                        timeout=30).content
    data = json.loads(html.decode('utf-8'))
    # Keep at most the first 100 entries; slicing tolerates shorter days,
    # where indexing up to 99 would raise IndexError.
    for rank, entry in enumerate(data[:100], start=1):
        rows.append({'date': day, 'title': entry['keyword'],
                     'searchCount': entry['searchCount'], 'rank': rank})
# Build the frame once: DataFrame.append copied the whole frame on every call
# and no longer exists in pandas 2.0.
resou = pd.DataFrame(rows, columns=['date', 'title', 'searchCount', 'rank'])
## Per-day statistics
# Tokenise every title row by row, then save the enriched table.
resou = resou.apply(get_words_list, axis=1)
resou.to_excel('热搜数据.xlsx')
# Mean search count per date. agg() with a list yields two-level column
# labels, which are flattened to plain names right after.
daily_groups = resou.groupby('date', as_index=False)
resou_dt = daily_groups.agg({'searchCount': ['mean']})
resou_dt.columns = ['date', 'avg_count']
## Calendar heat map of the daily average
# One [date, average] pair per day.
data = [
    [day, average]
    for day, average in zip(resou_dt['date'], resou_dt['avg_count'])
]
calendar_range = opts.CalendarOpts(range_=['2019-01-01', '2019-07-12'])
calendar_title = opts.TitleOpts(title="2019每日热搜平均指数", pos_left='15%')
calendar_visual_map = opts.VisualMapOpts(
    max_=3600000,
    min_=0,
    orient="horizontal",
    is_piecewise=False,
    pos_top="230px",
    pos_left="100px",
    pos_right="10px"
)
calendar_chart = Calendar(init_opts=opts.InitOpts(width='1800px', height='1500px'))
calendar_chart.add("", data, calendar_opts=calendar_range)
calendar_chart.set_global_opts(
    title_opts=calendar_title,
    visualmap_opts=calendar_visual_map,
)
# `calendar` holds whatever render() returns, as in the chained original.
calendar = calendar_chart.render('日期热力图.html')
## Word-frequency statistics
# One row per (hot-search entry, token), then count entries per token.
resou_word = dataframe_explode(resou, 'words_list')
word_groups = resou_word.groupby(['words_list'], as_index=False)
resou_word_stat = word_groups.agg({'title': ['count']})
resou_word_stat.columns = ['word', 'num']
resou_word_stat.to_excel('词频统计.xlsx')
# Category workbooks; each is joined on its 'word' column below.
func = pd.read_excel('名词.xlsx')
love = pd.read_excel('婚恋.xlsx')
person = pd.read_excel('人物.xlsx')
# Inner joins keep only the counted tokens that appear in each workbook.
resou_word_func = resou_word_stat.merge(func, how='inner', on='word')
resou_word_love = resou_word_stat.merge(love, how='inner', on='word')
resou_word_person = resou_word_stat.merge(person, how='inner', on='word')
# Pie chart of the tokens matched against 名词.xlsx, largest count first.
# The per-token counts are in 'num'; this script never creates a 'count'
# column in this frame.
resou_word_func = resou_word_func.sort_values(
    'num', ascending=False).reset_index()
# .tolist() converts numpy scalars to native Python values, which pyecharts
# needs when it serialises the chart options to JSON.
words = list(zip(resou_word_func['word'].tolist(),
                 resou_word_func['num'].tolist()))
pie = (
    Pie(init_opts=opts.InitOpts(theme=ThemeType.CHALK))
    .add("", words)
    .set_global_opts(title_opts=opts.TitleOpts(title="微博常用词出现次数", pos_left='center'),
                     legend_opts=opts.LegendOpts(is_show=False))
    .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}", font_size=16),)
    .render('热搜词饼图.html')
)
# Pie chart of the tokens matched against 婚恋.xlsx, largest count first.
# The per-token counts are in 'num'; this script never creates a 'count'
# column in this frame.
resou_word_love = resou_word_love.sort_values(
    'num', ascending=False).reset_index()
# .tolist() converts numpy scalars to native Python values, which pyecharts
# needs when it serialises the chart options to JSON.
words = list(zip(resou_word_love['word'].tolist(),
                 resou_word_love['num'].tolist()))
pie = (
    Pie(init_opts=opts.InitOpts(theme=ThemeType.CHALK))
    .add("", words)
    .set_global_opts(title_opts=opts.TitleOpts(title="微博婚恋类词语出现次数", pos_left='center'),
                     legend_opts=opts.LegendOpts(is_show=False))
    .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}", font_size=16),)
    .render('婚恋类热搜词饼图.html')
)
# Treemap of the tokens matched against 人物.xlsx, largest count first.
resou_word_person = resou_word_person.sort_values(
    'num', ascending=False).reset_index()
# Each treemap node is a dict with "value" and "name" keys. The braces around
# a formatted string previously produced a one-element set per row instead.
# NOTE(review): the 'name' column is presumably supplied by 人物.xlsx — verify.
words = [
    {"value": num, "name": name}
    for num, name in zip(resou_word_person['num'].tolist(),
                         resou_word_person['name'].tolist())
]
tree = (
    TreeMap(init_opts=opts.InitOpts(theme=ThemeType.ESSOS))
    .add("", words, pos_left=0, pos_right=0, pos_top=50, pos_bottom=50)
    .set_global_opts(title_opts=opts.TitleOpts(title="热搜明星出现次数排名", pos_left='center'),
                     legend_opts=opts.LegendOpts(is_show=False))
    .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}\n\n {c}", font_size=17,
                                               color='black', position='inside', font_weight='bolder'))
    .render('排序.html')
)
|