您的位置: 网站首页> SEO工具> 当前文章
python采集百度Mo相关搜索及其他人还在搜
老董-我爱我家房产SEO2020-09-19198围观,138赞
采集百度移动端相关词 + 其他人还在搜 这两类词。
采集结果保存2个文件,一个是原始文件,一个是去重计算出每个词重复次数的文件。线程数默认是1,现在百度反爬比之前严重!线程最好是1。【此外,多线程写同一个文件需要加锁否则可能数据错乱】
# ‐*‐ coding: utf‐8 ‐*‐ """ 百度移动端相关词和其他人还在搜采集 将结果去重并统计重复次数排序 bdmo_xgqita.txt为采集到的词 bdmo_xgqita_sort.txt为排序后的词 """ # python3.6版本 import requests import threading import queue from pyquery import PyQuery as pq import time # 获取某词serp源码 def get_html(url,my_header,retry=1): try: r = requests.get(url=url,headers=my_header,timeout=5) except Exception as e: print('获取源码失败',e) time.sleep(6) if retry > 0: get_html(url,my_header,retry-1) else: html = r.content.decode('utf-8',errors='ignore') # 用r.text有时候识别错误 url = r.url # 反爬会重定向,取定向后的地址 return html,url # 提取相关词+其他人还在搜 def get_kwds(html,url): xg_kwds = [] qita_kwds = [] doc = pq(html) title = doc('title').text() if '- 百度' in title and 'https://m.baidu.com/s?ie=utf-8' in url: # 获取相关搜索 xg_list = doc('.rw-list a').items() for a in xg_list: text = a.text() xg_kwds.append(text) # 获取其他人还在搜 div_list =doc('.c-result').items() for div in div_list: if div.attr('tpl') == 'recommend_list': a_list = div('section .c-row a').items() for a in a_list: text = a.text() qita_kwds.append(text) break # 找到目标div后就停止 else: print('源码error',url) time.sleep(100) xg_kwds.extend(qita_kwds) return xg_kwds # 结果排序 def sort_dict(dicts_kwd): res = sorted(dicts_kwd.items(), key=lambda e:e[1], reverse=True) for kwd,count in res: f_count.write('{0} {1} '.format(kwd,count)) f_count.flush() def main(): while 1: kwd = q.get() url = "https://m.baidu.com/s?ie=utf-8&word={0}".format(kwd) try: print(url) html,now_url = get_html(url,my_header) kwds = get_kwds(html,now_url) print(kwd,len(kwds)) for kwd in kwds: dicts_kwd[kwd] = dicts_kwd[kwd] + 1 if kwd in dicts_kwd else 1 f.write(kwd + ' ') f.flush() except Exception as e: print(e) finally: q.task_done() if __name__ == "__main__": dicts_kwd = {} # 结果保存文件 f = open('bdmo_xgqita.txt','w',encoding='utf-8') # 结果保存文件-排序 f_count = open('bdmo_xgqita_sort.txt','w',encoding='utf-8') # head设置 my_header = { 'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', 'Accept-Encoding':'deflate', 'Accept-Language':'zh-CN,zh;q=0.9,en;q=0.8', 'Cache-Control':'no-cache', 'Connection':'keep-alive', 'Cookie':'cuid=9D9A0DBD27C85A299930263FF6BE52CB;BAIDU_WISE_UID=wpass_1557773074883_645;HISTORY=7538961407b87ee55f98de183cdc4bf7e4a758d55c6bb28493;BAIDUID=AC1D01EB26A43281B80C01932C3B1B04:FG=1;STOKEN=18f5c4868f05de8b634e97c53240b53075a1259fd6c10d0ed69fdeb1898385e8;SAVEUSERID=81db925c8ee9433ed26e5949342dda8b732ee6b86d;PTOKEN=a706047f3786779f85f752cc0b76b51b;BDUSS=29GYXlkb2tORTd3MkJCVG02ejlUUHJPMVRnSGZ6ekN6VVF-UjcyZVFZQXpTQUZkSVFBQUFBJCQAAAAAAAAAAAEAAAADZvwsv76w~NfT1tfDtMHLAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAADO72Vwzu9lcd;PASSID=FuYK;UBI=fi_PncwhpxZ%7ETaJc0sNwb%7ENx8KDCUsDofWK6%7Ezu9rupW0C6n2vvtflUqpArFgdr87G2TYzPXaQXGWs2WXC2', 'Host':'m.baidu.com', 'Pragma':'no-cache', 'Referer':'https://m.baidu.com/', 'Sec-Fetch-Mode':'navigate', 'Sec-Fetch-Site':'same-origin', 'Sec-Fetch-User':'?1', 'Upgrade-Insecure-Requests':'1', 'User-Agent':'Mozilla/5.0 (Linux; U; Android 8.1.0; zh-cn; BLA-AL00 Build/HUAWEIBLA-AL00) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/57.0.2987.132 MQQBrowser/8.9 Mobile Safari/537.36',} # 关键词队列 q = queue.Queue() for kwd in open('kwd.txt','r',encoding='utf-8'): kwd = kwd.strip() q.put(kwd) # 设置线程数 for i in list(range(1)): t = threading.Thread(target=main) t.setDaemon(True) t.start() q.join() sort_dict(dicts_kwd) f.flush() f.close() f_count.close()
很赞哦!
python编程网提示:转载请注明来源www.python66.com。
有宝贵意见可添加站长微信(底部),获取技术资料请到公众号(底部)。同行交流请加群
相关文章
文章评论
-
python采集百度Mo相关搜索及其他人还在搜文章写得不错,值得赞赏
站点信息
- 网站程序:Laravel
- 客服微信:a772483200