Source: python中国网  Date: 2019-11-13

  Different people approach SEO with different ideas, so all sorts of odd requirements come up, such as collecting the real URLs behind a Baidu search results page. On a Baidu SERP, each organic result links to an encrypted redirect of the form http://www.baidu.com/link?url=..., so the real landing URL has to be recovered by resolving that redirect. Many SEO tools need this same step internally; the code below collects the real URLs behind Baidu PC rankings.

  1. Prepare the keyword file kwd.txt (one keyword per line; a sample is sketched after this list).

  2. The results are saved to bdpc_real_url.txt.
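  For reference, kwd.txt is just a plain UTF-8 text file with one query per line. The keywords below are made-up examples:

鞍山二手宝马
二手宝马 价格
seo 工具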

# -*- coding: utf-8 -*-
# Python 3.7
import requests
import threading
import queue
from pyquery import PyQuery as pq

# Fetch the SERP source for one query URL
def get_html(url, retry=2):
    try:
        r = requests.get(url=url, headers=user_agent, timeout=5)
    except Exception as e:
        print('Failed to fetch page source', url, e)
        if retry > 0:
            return get_html(url, retry - 1)  # return the retry result, or it is silently lost
    else:
        html = r.content.decode('utf-8')  # r.text sometimes guesses the encoding wrong
        return html


# Extract all encrypted organic-ranking URLs from the SERP source
def get_encrypt_urls(html):
    encrypt_url_list = []
    if html and '_百度搜索' in html:  # Baidu SERP titles end with '_百度搜索'
        doc = pq(html)
        try:
            a_list = doc('.t a').items()
        except Exception as e:
            print('Failed to extract encrypted urls from the SERP', e)
        else:
            for a in a_list:
                encrypt_url = a.attr('href')
                if encrypt_url and encrypt_url.startswith('http://www.baidu.com/link?url='):
                    encrypt_url_list.append(encrypt_url)
    return encrypt_url_list


# Decode one encrypted URL into the real landing URL
def decrypt_url(encrypt_url, retry=1):
    try:
        encrypt_url = encrypt_url.replace('http://', 'https://')
        # requests.head does not follow redirects by default,
        # so the 302 Location header carries the real URL
        r = requests.head(encrypt_url, headers=user_agent)
    except Exception as e:
        print(encrypt_url, 'failed to decode', e)
        if retry > 0:
            return decrypt_url(encrypt_url, retry - 1)  # return the retry result
    else:
        return r.headers.get('Location')


# Worker: resolve the real first-page ranking URLs for each keyword in the queue
def get_real_urls():
    while 1:
        kwd = q.get()
        try:
            url = 'https://www.baidu.com/s?wd={0}'.format(kwd)
            html = get_html(url)
            encrypt_url_list = get_encrypt_urls(html)
            if encrypt_url_list:
                real_url_list = [decrypt_url(encrypt_url) for encrypt_url in encrypt_url_list]
                for url in real_url_list:
                    print(url)
                    f.write(str(url) + '\n')
            else:
                print('No encrypted urls found on the SERP')
        except Exception as e:
            print(e)
        finally:
            q.task_done()

if __name__ == "__main__":
    # Output file for the results
    f = open('bdpc_real_url.txt', 'w', encoding='utf-8')
    # UA header
    user_agent = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
    # Keyword queue
    q = queue.Queue()
    for kwd in open('kwd.txt', encoding='utf-8'):
        q.put(kwd.strip())
    # Worker threads
    for i in range(5):
        t = threading.Thread(target=get_real_urls)
        t.daemon = True  # preferred over the deprecated t.setDaemon(True)
        t.start()
    q.join()
    f.flush()
    f.close()
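  The decoding trick can also be sanity-checked on its own: requests.head does not follow redirects by default, so the Location header of the 302 response to an encrypted link is the real landing URL. A minimal sketch, assuming a placeholder encrypted URL (paste a real baidu.com/link?url= value copied from a SERP before running):

import requests

ua = {'User-Agent': 'Mozilla/5.0'}
encrypt_url = 'https://www.baidu.com/link?url=XXXXXXXX'  # hypothetical placeholder
r = requests.head(encrypt_url, headers=ua)  # HEAD does not follow the redirect
print(r.status_code)                        # expect 302 for a valid encrypted URL
print(r.headers.get('Location'))            # the real landing URL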

Sample output from a test run:

http://as.58.com/baoma/
http://anshan.ganji.com/bmw/
https://anshan.baixing.com/ershouqiche/m7890/
http://as.58.com/baomawuxi/
https://m.58.com/as/baomacar/
https://www.iautos.cn/2scbrand-anshanbaoma5xi/
https://www.che168.com/anshan/baoma/
https://www.guazi.com/anshan/bmw/
http://map.baidu.com/?newmap=1&s=con%26wd%3D%E9%9E%8D%E5%B1%B1%E4%BA%8C%E6%89%8B%E5%AE%9D%E9%A9%AC%26c%3D131&from=alamap&tpl=mapdots
http://3g.ganji.com/anshan_bmw/
https://www.guazi.com/anshan/bmw/
http://as.58.com/baoma/
http://anshan.ganji.com/bmw/
https://anshan.baixing.com/ershouqiche/m7890/
https://www.iautos.cn/2scbrand-anshanbaomaacbaomajinkou/
https://www.58.com/baoma/
https://www.che168.com/anshan/baoma/baoma5xi/
https://m.iautos.cn/anshan/all-baoma3xi/
http://3g.ganji.com/anshan_bmw/
https://m.58.com/as/baoma/


  Copy the code above for collecting the real URLs of Baidu PC rankings and give it a test run; feedback is welcome if anything goes wrong.