0. Preface
The 爱情公寓 (iPartment) movie recently hit theaters, trading entirely on nostalgia, yet it has reportedly already landed a 2.7 on Douban, only a hair above 逐梦演艺圈, the reigning worst-rated film. So today I wrote a crawler, scraped a thousand-plus short comments from Douban, and took a look at why this movie is rated so poorly.
1. Code
No more talk, straight to the code, since it is fairly simple: it fetches the review pages, extracts the short-comment text with XPath, segments it with jieba, and renders a word cloud.
from lxml import etree
from urllib import request
import ssl
import jieba
from jieba import analyse
from wordcloud import WordCloud, STOPWORDS
from matplotlib import pyplot as plt

# Extract all short comments from one page of HTML
def get_comments(comments):
    print(comments)
    root = etree.HTML(comments)
    comment_content = ''
    comment_list = root.xpath('//*[@class="short-content"]/text()')
    print('comment size is %d' % len(comment_list))
    for comment in comment_list:
        comment_content += comment
    comment_content = comment_content.replace('()', '')
    print('comment_content is %s' % comment_content)
    return comment_content

# Segment the text with jieba, drop stop words, and render the word cloud
def generate_wordcloud(comments):
    stopwords = set(STOPWORDS)
    stopwords.add('爱情')
    stopwords.add('公寓')
    stopwords.add('电影')
    stopwords.add('评分')
    stopwords.add('...')
    comments = jieba.cut(comments)
    comments = [comment for comment in comments if comment not in stopwords]
    comments = ' '.join(comments)
    # Keep only the 300 highest-weighted keywords
    comments = str(analyse.extract_tags(comments, topK=300))
    background = plt.imread('pic.jpg')  # image used as the word-cloud mask
    wc = WordCloud(font_path='msyh.ttf', random_state=30, width=800, height=800,
                   mask=background, background_color='white',
                   max_font_size=80, max_words=2000)
    wc = wc.generate_from_text(comments)
    plt.imshow(wc)
    plt.axis('off')
    plt.show()

# Crawl 20 pages of reviews and feed them to the word-cloud generator
url = 'https://movie.douban.com/subject/24852545/reviews?start=%d'
ssl._create_default_https_context = ssl._create_unverified_context  # skip certificate verification
comments = ''
for i in range(0, 20):
    offset_url = url % (20 * i)
    print('url is %s' % offset_url)
    with request.urlopen(offset_url) as f:
        content = f.read()
        comment = get_comments(content)
        comments += comment
generate_wordcloud(comments)
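One practical note on the fetching step: Douban sometimes returns empty or blocked pages to scripts that send no browser-like User-Agent and that fire requests back to back. The helper below is a minimal sketch of how the fetch could be hardened, assuming a generic Mozilla User-Agent string and a one-second pause between pages; neither detail is part of the original script.

import time
from urllib import request

# Hypothetical fetch helper: send a browser-like User-Agent and pause between pages.
# The header value and the 1-second delay are assumptions, not taken from the original post.
def fetch(offset_url):
    req = request.Request(offset_url, headers={'User-Agent': 'Mozilla/5.0'})
    with request.urlopen(req) as f:
        content = f.read()
    time.sleep(1)  # be polite to the server between page loads
    return content

With this helper in place, the loop above would call content = fetch(offset_url) instead of opening the URL directly.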