想2秒搜索到高质量的b站视频？全自动弹幕词云搜索值得拥有

piaodoo 编程教程 2020-02-22 22:16:13 1193 0 python教程

本文来源吾爱破解论坛

本帖最后由 54048371 于 2020-2-18 23:03 编辑

2020-2-18更新：
本次更新亮点：
1.支持输入关键词自动搜索b站视频
2.支持自定义爬取词云图片的页码范围
3.支持自定义搜索排序方式，支持综合排序、按弹幕排序、按点击排序等
4.支持自定义搜索视频的时长范围
注意事项：需要在代码中修改词云图片的保存位置（这里默认为F盘）；其次，wordcloud需要自己设定字体文件的位置，可以自行修改。Windows字体一般保存在C盘Windows目录下的Fonts文件夹里，自行替换即可。

食用方法：该脚本的依赖库安装可能需要费一点心思，例如wordcloud，但好在网上该问题教程丰富，简单易懂，也好上手

import re,requests,os,time
from lxml import etree
from wordcloud import WordCloud
安装好以上的库咱们就可以欢快地食用该程序啦！
附赠一条TIPS：由于pip安装库默认使用国外下载源，10k/s的速度在几百兆宽带的时代怎么能忍受，所以推荐使用以下两个国内镜像：
pip install -i https://pypi.douban.com/simple/ salt

pip install -i https://pypi.tuna.tsinghua.edu.cn/simple/ salt
#用你想安装的库名称替换salt，然后复制粘贴放入cmd运行就很愉快地跑起来啦

2020-02-15-10-36-19词云.jpg (246.18 KB, 下载次数: 0)

下载附件保存到相册

2020-2-15 10:36 上传

2020-02-15-10-35-52词云.jpg (209.9 KB, 下载次数: 0)

下载附件保存到相册

2020-2-15 10:36 上传

无标题.png (38.66 KB, 下载次数: 0)

下载附件保存到相册

2020-2-18 22:53 上传

无标题1.png (227.61 KB, 下载次数: 0)

下载附件保存到相册

2020-2-18 22:53 上传

import os
import re
import time
from urllib.parse import quote

import requests
from lxml import etree
from wordcloud import WordCloud

class make_word_cloud:
    """Search bilibili for a keyword and save one danmaku word cloud per video.

    The constructor prompts on stdin for the keyword, the page range, the sort
    order and the minimum duration. ``start_all`` then runs the pipeline:
    search result pages -> video pages -> danmaku XML -> word cloud image.
    """

    # Font file handed to WordCloud. Change it to a font file that exists on
    # your system; per the original author's note, Chinese text is rendered
    # garbled when no font file is configured.
    FONT_PATH = 'F:/scrapy/bilibili_danmu/bilibili_danmu/yahei.ttc'

    # Seconds to wait for an HTTP response before giving up on a request.
    REQUEST_TIMEOUT = 10

    # Menu choice -> value of the ``order`` query parameter of the search URL.
    _ORDER_BY_CHOICE = {1: 'totalrank', 2: 'dm', 3: 'click'}

    # "x minutes or longer" menu choice -> value of the ``duration`` parameter.
    _DURATION_BY_CHOICE = {0: 0, 10: 1, 20: 2, 30: 3}

    # Candidate locations of the video links on a search result page, tried
    # in this order; the first one that yields any link wins.
    _VIDEO_LINK_XPATHS = (
        '//*[@id="all-list"]/div[1]/div[2]/ul[2]/li/div/div[1]/a/@href',
        '//*[@id="all-list"]/div[1]/ul/li/div/div[1]/a/@href',
        '//*[@id="all-list"]/div[1]/div[2]/ul/li/div/div[1]/a/@href',
    )

    # Characters that must not appear in a file name: '<' is dropped and the
    # others become 'z', exactly as the original replace() chain did.
    _FILENAME_TABLE = str.maketrans({
        '<': '',
        '>': 'z',
        '/': 'z',
        '|': 'z',
        ':': 'z',
        '"': 'z',
        '*': 'z',
        '?': 'z',
        '\\': 'z',
    })

    def __init__(self):
        """Prompt for the search settings and derive the output directory."""
        self.search_kw = input('请输入b站搜索关键词：')
        # Stored zero-based so it can be used directly as a range() start.
        self.page_from = int(input('从第几页开始下载？')) - 1
        self.page_to = int(input('你需要下载到第几页？'))
        self.order_choice = int(input('请输入搜索排序类型==综合排序-1==弹幕排序-2==点击排序-3=='))
        self.duration_choice = int(input('请输入视频时长选择（x分钟以上）==00==10==20==30=='))
        # Browser-like User-Agent sent with every request.
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1'}

        # Letter of the drive the word cloud images are saved to.
        self.disk_index = 'F'
        # The keyword becomes a directory name, so strip the characters that
        # are not allowed in file names.
        self.path = '%s:/哔哩哔哩弹幕词云v2.0/%s/' % (
            self.disk_index, self._safe_filename(self.search_kw))

    @staticmethod
    def _safe_filename(title):
        """Return *title* with characters that are illegal in file names replaced."""
        return title.translate(make_word_cloud._FILENAME_TABLE)

    @staticmethod
    def _absolute_url(href):
        """Turn a protocol-relative ``//host/path`` href into an https URL."""
        if href.startswith('//'):
            return 'https:' + href
        return href

    def start_all(self):
        """Run the whole pipeline for the settings collected in ``__init__``.

        A failure on one video is reported and skipped so that the remaining
        videos are still processed.
        """
        video_page_url_list = self.get_video_url_by_kw(
            self.search_kw, self.page_from, self.page_to,
            self.order_choice, self.duration_choice)
        count = 1  # progress counter: index of the video being finished
        for url in video_page_url_list:
            print(url)  # show which video page is being crawled
            try:
                made = self._make_cloud_for_video(url)
            except Exception as exc:
                print('pass掉第%s个视频' % (count))
                print(repr(exc))  # say why the video was skipped
                count += 1
                continue
            if made:
                print('完成第%s个视频' % (count))
                count += 1

    def _make_cloud_for_video(self, url):
        """Render the word cloud of one video; return False if it has no danmaku."""
        cid_url, title = self.get_cid_url_title(url)
        time.sleep(2)  # pause between requests to avoid being banned
        danmu_list = self.get_danmu(cid_url)
        if not danmu_list:
            return False
        self.make_dir(self.path)
        # WordCloud takes one string, so join the comments with spaces.
        self.word_cloud(' '.join(danmu_list), self.path, title)
        return True

    def request_decode(self, url):
        """GET *url* and return the response body decoded as UTF-8 text."""
        return self.request_nodecode(url).decode()

    def request_nodecode(self, url):
        """GET *url* and return the raw response body as bytes."""
        # requests.get closes its internal session, unlike a bare
        # requests.session() that is never closed; the timeout keeps a
        # stalled connection from hanging the crawl forever.
        response = requests.get(url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
        return response.content

    def get_video_url_by_kw(self, kw, page_from, page_to, order_choice, duration_choice):
        """Collect the video page URLs listed on the requested search pages.

        Args:
            kw: search keyword.
            page_from: zero-based index of the first search page.
            page_to: one-based number of the last search page (inclusive).
            order_choice: 1 (overall rank), 2 (danmaku count) or 3 (clicks).
            duration_choice: 0, 10, 20 or 30 (minimum length in minutes).

        Returns:
            A list of absolute ``https://`` video page URLs.

        Raises:
            ValueError: if ``order_choice`` or ``duration_choice`` is not one
                of the supported values.
        """
        if order_choice not in self._ORDER_BY_CHOICE:
            raise ValueError('unsupported order_choice: %r' % (order_choice,))
        if duration_choice not in self._DURATION_BY_CHOICE:
            raise ValueError('unsupported duration_choice: %r' % (duration_choice,))
        order = self._ORDER_BY_CHOICE[order_choice]
        duration = self._DURATION_BY_CHOICE[duration_choice]

        # Percent-encode the keyword so characters such as '&' or '#' cannot
        # break the query string.
        encoded_kw = quote(kw)
        search_page_url_list = [
            'https://search.bilibili.com/all?keyword=%s&page=%s&order=%s&duration=%s&tids_1=0'
            % (encoded_kw, page, order, duration)
            for page in range(page_from + 1, page_to + 1)
        ]
        print(str(search_page_url_list) + '\n' + '==找到关于%s的%s页搜索数据，进行下一步处理==' % (kw, page_to))

        video_page_url_list = []
        for search_url in search_page_url_list:
            tree = etree.HTML(self.request_nodecode(search_url))
            if tree is None:  # nothing parseable came back for this page
                continue
            # The link list sits at different places depending on the page
            # layout, so try each known location until one yields links.
            hrefs = []
            for xpath in self._VIDEO_LINK_XPATHS:
                hrefs = tree.xpath(xpath)
                if hrefs:
                    break
            video_page_url_list.extend(self._absolute_url(href) for href in hrefs)
        print(str(video_page_url_list) + '\n' + '==已找到%s页数据中的所有视频URL==' % (page_to))
        return video_page_url_list

    def get_cid_url_title(self, avurl):
        """Return ``(danmaku_xml_url, title)`` for the video page at *avurl*.

        Raises:
            ValueError: if the cid or the title cannot be found in the page.
        """
        content = self.request_decode(avurl)
        cid_match = re.search('cid=(.*?)&aid=.*&attribute=.*', content)
        title_match = re.search('<title data-vue-meta="true">(.*?)</title>', content)
        if cid_match is None or title_match is None:
            raise ValueError('cid or title not found in %s' % (avurl,))
        cid_url = 'https://api.bilibili.com/x/v1/dm/list.so?oid=%s' % (cid_match.group(1))
        print(cid_url)
        return cid_url, title_match.group(1)

    def get_danmu(self, url):
        """Return the list of danmaku texts found in the XML at *url*."""
        data = etree.HTML(self.request_nodecode(url))
        if data is None:  # empty or unparseable response
            return []
        # NOTE(review): the original drops the last <d> entry; the reason is
        # not visible here, so the slice is kept as-is -- confirm it is wanted.
        danmu = data.xpath('//d/text()')[:-1]
        print(danmu)
        return danmu

    def make_dir(self, path):
        """Create *path* if it is missing and report where images will go."""
        if os.path.exists(path):
            print('目录存在,词云已保存至%s' % (path))
            return
        os.makedirs(path)
        print('创建目录成功词云已保存至%s' % (path))

    def word_cloud(self, text, path, title):
        """Render *text* as a word cloud and save it as ``<path><title>词云.jpg``.

        Args:
            text: space-separated danmaku text.
            path: output directory, including the trailing slash.
            title: video title; illegal file-name characters are replaced.
        """
        # scale sets the image sharpness, max_words caps the number of words
        # shown, and prefer_horizontal=1.0 makes every word horizontal.
        wdcld = WordCloud(font_path=self.FONT_PATH, scale=20, max_words=100,
                          max_font_size=25, min_font_size=10, prefer_horizontal=1.0)
        wdcld.generate(text)
        wdcld.to_file('%s%s词云.jpg' % (path, self._safe_filename(title)))

def exe_it_loop():
    """Run interactive sessions back to back until one of them fails.

    After every successful session a new one is started, which prompts for
    the search settings again. The first error, or Ctrl-C, ends the loop.
    """
    # A plain loop replaces the original self-recursion, which added one
    # stack frame per session and would eventually hit the recursion limit.
    while True:
        try:
            make_word_cloud().start_all()
        except (Exception, KeyboardInterrupt) as exc:
            print('运行错误')
            print(repr(exc))  # report the cause instead of hiding it
            return
        print('运行成功')

# Script entry point: run one interactive session (prompts for the search
# settings, then crawls and renders the word clouds).
# To start a new session after each successful run, call exe_it_loop() instead:
# exe_it_loop()
make_word_cloud().start_all()

版权声明：

本站所有资源均为站长或网友整理自互联网或站长购买自互联网，站长无法分辨资源版权出自何处，所以不承担任何版权以及其他问题带来的法律责任，如有侵权或者其他问题请联系站长删除！站长QQ754403226 谢谢。

有关影视版权：本站只供百度云网盘资源，版权均属于影片公司所有，请在下载后24小时删除，切勿用于商业用途。本站所有资源信息均从互联网搜索而来，本站不对显示的内容承担责任，如您认为本站页面信息侵犯了您的权益，请附上版权证明邮件告知【754403226@qq.com】，在收到邮件后72小时内删除。本文链接：http://www.piaodoo.com/8046.html