Scraping this blog's article source code with Python



import urllib.request

from bs4 import BeautifulSoup  # the lxml package must also be installed; it is used as the parser backend

# Send a browser-like User-Agent so the requests are not rejected
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36'
}


def data_requests(main_data):
    """Fetch every article page and save its body to an HTML file named after its title."""
    list1 = []  # article titles (one select() result per article)
    list2 = []  # article bodies
    for i in main_data:
        # Request each article page with the same browser User-Agent
        a = urllib.request.Request(i["href"], headers=headers)
        data = urllib.request.urlopen(a)
        data = data.read()
        data = data.decode()
        soup = BeautifulSoup(data, "lxml")
        # Title: <article class="hentry"> -> <h1 class="entry-title">
        hx = soup.select("article[class='hentry'] h1[class='entry-title']")
        list1.append(hx)
        # Body: <div id="primary"> -> <div class="entry-content">
        hd = soup.select("div[id='primary'] div[class='entry-content']")
        list2.append(hd)

    # The original range(0, len(list2) - 1) skipped the last article
    for i in range(len(list2)):
        # Name the file after the article title and write the body's HTML
        with open(list1[i][0].text + ".html", "w", encoding="utf-8") as p:
            p.write(str(list2[i][0]))





def main():
    url = "https://www.idle.fit/"

    # Download the blog's index page
    a = urllib.request.Request(url, headers=headers)
    data = urllib.request.urlopen(a)

    data = data.read()
    data = data.decode()
    soup = BeautifulSoup(data, "lxml")
    # Each index entry's title links to the full article
    main_data = soup.select("main[class='site-main indexMain'] h1[class='entry-title'] a")

    return data_requests(main_data)


if __name__ == '__main__':
    main()
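
The CSS selectors above are tied to this blog's WordPress-style theme. As a rough, self-contained sketch of what the index-page selector expects, the snippet below runs against a hypothetical HTML fragment (the real markup on www.idle.fit may differ). Note that [class='...'] is an exact match on the whole class attribute, so the class order and spacing must match exactly.

from bs4 import BeautifulSoup

# Hypothetical fragment mirroring the structure main() selects against
sample = """
<main class="site-main indexMain">
  <article class="hentry">
    <h1 class="entry-title"><a href="https://www.idle.fit/sample-post">Sample post</a></h1>
  </article>
</main>
"""

soup = BeautifulSoup(sample, "lxml")
for a in soup.select("main[class='site-main indexMain'] h1[class='entry-title'] a"):
    print(a.text, a["href"])  # Sample post https://www.idle.fit/sample-post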
