A simple crawler I wrote a long time ago; straight to the code:
#!/usr/bin/python3
#-*- coding:utf-8 -*-
import requests
import time
from pyquery import PyQuery as pq
import os


class QSBK:
    def __init__(self):
        self.page = 1
        self.li = []        # collected text posts
        self.picnum = 0     # running counter for downloaded pictures
        os.makedirs('./pic2', exist_ok=True)  # make sure the image directory exists

    def get_page(self, page):
        # The first page has no page number in the URL; later pages do.
        if page == 1:
            url = 'http://www.qiushibaike.com/hot/'
        else:
            url = 'http://www.qiushibaike.com/hot/page/' + str(page)
        # url = 'http://www.qiushibaike.com/imgrank/' + str(page)
        r = requests.get(url)
        return r.text

    def get_page_items(self):
        d = pq(self.get_page(self.page))
        es = d(".article, .block, .untagged, .mb15")
        for i in es.items():
            if i('.thumb img').attr('src') is None:
                # Text-only post: keep its text for the output file.
                self.li.append(i('.content').text())
            else:
                # Post with a picture: stream the image into ./pic2/.
                r = requests.get(i('.thumb img').attr('src'), stream=True)
                self.picnum += 1
                with open('./pic2/' + str(self.picnum) + '.jpeg', 'wb') as fd:
                    for chunk in r.iter_content(chunk_size=1024):
                        fd.write(chunk)
                print('Picture %s downloaded.' % self.picnum)

    def start(self):
        for i in range(35):  # the site currently has 35 pages
            self.get_page_items()
            print('Page %s text fetched.' % self.page)
            self.page += 1
            time.sleep(2)

    def write_file(self):
        with open('fun.txt', 'w', encoding='utf-8') as f:
            for pre in self.li:
                f.write(pre + '\n\n')

QS = QSBK()
QS.start()
QS.write_file()
Generated files:

Joke text file:

This program is just for practicing web scraping.
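One practical note: some sites (qiushibaike among them, at times) reject requests that carry the default requests User-Agent, so if get_page starts returning error pages, passing a browser-like header to requests.get may help. A minimal sketch (the UA string below is just a placeholder, not from the original code):

headers = {'User-Agent': 'Mozilla/5.0'}  # placeholder UA; any common browser UA string works
r = requests.get(url, headers=headers)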