通过代码抓取个人博客中某一页的文章链接,并将这些链接批量推送到百度站长平台,以达到快速收录的目的。
import sys import requests from bs4 import BeautifulSoup
def push_page(url):
    """Submit a single URL to the Baidu link-submission endpoint.

    Args:
        url: Address of the article to submit. It is sent as the plain-text
            request body.

    Returns:
        1 when the JSON reply's ``success`` field equals 1, otherwise 0
        (this includes empty input, network failures and malformed replies).
    """
    # Nothing to submit (e.g. an anchor without an href): skip the request.
    if not url:
        return 0

    # Content-Length is deliberately not set: requests derives it from the
    # actual body, whereas a fixed value only fits one particular URL.
    headers = {
        'User-Agent': 'curl/7.12.1',
        'Host': 'data.zz.baidu.com',
        'Content-Type': 'text/plain',
    }
    # NOTE(review): the site token is embedded in the source; consider
    # loading it from configuration instead.
    urls = "http://data.zz.baidu.com/urls?site=https://www.lyshark.com&token=MpHvVKjbs10XqaW"
    # Encode explicitly so the bytes on the wire do not depend on the HTTP
    # stack's default string encoding.
    body = url.encode("utf-8") if isinstance(url, str) else url

    try:
        response = requests.post(urls, headers=headers, data=body, timeout=5)
        # Parse the reply as JSON; never eval() text received from the network.
        push_status = response.json()['success']
    except (requests.RequestException, ValueError, KeyError, TypeError):
        # Best effort: any transport or parsing problem counts as "not pushed".
        return 0

    return 1 if push_status == 1 else 0
def get_page(page):
    """Fetch one listing page and push every article link found on it.

    Args:
        page: URL of the blog listing page to scan.

    Returns:
        1 when the page was fetched and its links were processed, 0 when
        fetching or parsing failed.
    """
    # The download is part of the best-effort contract too: a timeout or an
    # HTTP error status yields 0 instead of an unhandled exception.
    try:
        response = requests.get(page, timeout=5)
        response.raise_for_status()
    except requests.RequestException:
        return 0

    try:
        bs = BeautifulSoup(response.text, "html.parser")
        ret = bs.select('div[class="container"] div[class="row"] h2[class="post-title"] a')
        for item in ret:
            push_url = item.get('href')
            # Anchors without an href have nothing to push.
            if not push_url:
                continue
            push_ref = push_page(push_url)
            print("推送: {} --> 状态: {}".format(push_url, push_ref))
    except Exception:
        # Parsing problems are reported as failure; unlike a bare except this
        # still lets KeyboardInterrupt and SystemExit propagate.
        return 0
    return 1
if __name__ == "__main__":
    # The listing page to scan is the first command-line argument; exit with
    # a usage hint instead of an IndexError traceback when it is missing.
    if len(sys.argv) < 2:
        sys.exit("usage: python {} <page_url>".format(sys.argv[0]))
    get_page(sys.argv[1])
|
2.0 版本:批量推送(先生成多个分页链接,提取其中的文章链接,再逐条推送)
import requests from bs4 import BeautifulSoup
def push_page(url):
    """Submit a single URL to the Baidu link-submission endpoint and log it.

    Args:
        url: Address of the article to submit. It is sent as the plain-text
            request body.

    Returns:
        1 when the JSON reply's ``success`` field equals 1, otherwise 0
        (this includes empty input, network failures and malformed replies).
        The status line is printed only when a reply was parsed.
    """
    # Nothing to submit (e.g. an anchor without an href): skip the request.
    if not url:
        return 0

    # 'text/plain' replaces the malformed 'text/plain-t' media type.
    # Content-Length is deliberately not set: requests derives it from the
    # actual body, whereas a fixed value only fits one particular URL.
    headers = {
        'User-Agent': 'curl/7.12.1',
        'Host': 'data.zz.baidu.com',
        'Content-Type': 'text/plain',
    }
    # NOTE(review): the site token is embedded in the source; consider
    # loading it from configuration instead.
    urls = "http://data.zz.baidu.com/urls?site=https://www.lyshark.com&token=C5pA6XTWlCxdCwB"
    # Encode explicitly so the bytes on the wire do not depend on the HTTP
    # stack's default string encoding.
    body = url.encode("utf-8") if isinstance(url, str) else url

    try:
        response = requests.post(urls, headers=headers, data=body, timeout=5)
        # Parse the reply as JSON; never eval() text received from the network.
        push_status = response.json()['success']
    except (requests.RequestException, ValueError, KeyError, TypeError):
        # Best effort: any transport or parsing problem counts as "not pushed".
        return 0

    # The same status line is reported for both outcomes.
    print("推送页面: {} 推送状态: {}".format(url, push_status))
    return 1 if push_status == 1 else 0
def get_page(page):
    """Collect the article links published on one listing page.

    Args:
        page: URL of the blog listing page to scan.

    Returns:
        A list with the href of every matching article anchor. An empty list
        is returned when the page cannot be fetched or parsed, so the result
        can always be passed to ``list.extend``.
    """
    # The download is part of the best-effort contract too: a timeout or an
    # HTTP error status yields an empty result instead of an exception.
    try:
        response = requests.get(page, timeout=5)
        response.raise_for_status()
    except requests.RequestException:
        return []

    push_url_list = []
    try:
        bs = BeautifulSoup(response.text, "html.parser")
        ret = bs.select('div[class="container"] div[class="row"] h2[class="post-title"] a')
        for item in ret:
            push_url = item.get('href')
            # Anchors without an href have nothing to push.
            if not push_url:
                continue
            print("提取链接: {}".format(push_url))
            push_url_list.append(push_url)
    except Exception:
        # Parsing problems are reported as "no links"; unlike a bare except
        # this still lets KeyboardInterrupt and SystemExit propagate.
        return []
    return push_url_list
def create_page(start, end, base_url="https://www.lyshark.com"):
    """Build the listing-page URLs for page numbers ``start`` through ``end``.

    Args:
        start: First page number (inclusive).
        end: Last page number (inclusive).
        base_url: Site root the ``/page/<n>/`` path is appended to; a
            trailing slash is ignored. Defaults to the original blog.

    Returns:
        A list of page URLs in ascending page order; empty when
        ``end < start``.
    """
    root = base_url.rstrip("/")
    page_list = [f"{root}/page/{ea}/" for ea in range(start, end + 1)]
    # Log every generated link, in the same order they are returned.
    for page in page_list:
        print("创建页面链接: {}".format(page))
    return page_list
if __name__ == "__main__":
    # NOTE(review): this loop never terminates and has no delay between
    # rounds, so pages 1-15 are re-crawled and re-pushed continuously.
    # Confirm this is intended; a pause or a single pass may be preferable.
    while True:
        push_url = []
        page = create_page(1, 15)
        for each in page:
            ref = get_page(each)
            # get_page signals failure with a non-list falsy value; extending
            # with it would raise TypeError, so only merge real results.
            if ref:
                push_url.extend(ref)

        # Assumes the push loop belongs inside the while body (the source
        # layout was flattened); outside it the loop would be unreachable.
        for url in push_url:
            push_page(url)
|