A simple scraper I wrote quite a while ago; straight to the code:
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import os
import time

import requests
from pyquery import PyQuery as pq


class QSBK:
    def __init__(self):
        self.page = 1
        self.li = []      # collected joke texts
        self.picnum = 0   # running counter for downloaded pictures
        os.makedirs('./pic2', exist_ok=True)  # make sure the image directory exists

    def get_page(self, page):
        # Fetch the HTML of one "hot" page
        if page == 1:
            url = 'http://www.qiushibaike.com/hot/'
        else:
            url = 'http://www.qiushibaike.com/hot/page/' + str(page)
            # url = 'http://www.qiushibaike.com/imgrank/' + str(page)
        r = requests.get(url)
        return r.text

    def get_page_items(self):
        # Parse the current page and split items into text jokes and pictures
        d = pq(self.get_page(self.page))
        es = d(".article, .block, .untagged, .mb15")
        for i in es.items():
            if i('.thumb img').attr('src') is None:
                # Text-only item: keep the joke text
                self.li.append(i('.content').text())
            else:
                # Item with a picture: stream the image to disk
                r = requests.get(i('.thumb img').attr('src'), stream=True)
                self.picnum += 1
                with open('./pic2/' + str(self.picnum) + '.jpeg', 'wb') as fd:
                    for chunk in r.iter_content():
                        fd.write(chunk)
                print('Picture %s downloaded.' % self.picnum)

    def start(self):
        for i in range(35):  # 35 pages in total at the moment
            self.get_page_items()
            print('Text from page %s fetched.' % self.page)
            self.page += 1
            time.sleep(2)  # be polite: pause between pages

    def write_file(self):
        # Dump all collected jokes to a text file
        with open('fun.txt', 'w', encoding='utf-8') as f:
            for pre in self.li:
                f.write(pre + '\n\n')


QS = QSBK()
QS.start()
QS.write_file()
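The picture branch above streams each image to disk with requests rather than loading it into memory. Pulled out on its own, that step looks roughly like the sketch below; the save_image name and the chunk_size value are illustrative and not part of the original script.

import requests

def save_image(url, path, chunk_size=1024):
    """Stream an image URL to a local file."""
    r = requests.get(url, stream=True)
    r.raise_for_status()  # fail early on HTTP errors
    with open(path, 'wb') as fd:
        for chunk in r.iter_content(chunk_size=chunk_size):
            fd.write(chunk)

# Example use (hypothetical URL):
# save_image('http://example.com/some.jpeg', './pic2/1.jpeg')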
Generated output: the downloaded images under pic2/ and the joke text collected in fun.txt.
This program was written only to practice web scraping.