Level 0: Getting to Know Crawlers
1. A First Look at Crawlers
In essence, a crawler is a program that fetches data of value to us from the web.
2. Mapping Out the Path
2-1. How a Browser Works
(1) Parse data: when the server sends data back, the browser does not hand it to us raw. The data is written in a machine language, so the browser first turns it into content we can actually read;
(2) Extract data: from the data we receive, we pick out the parts that are useful to us;
(3) Store data: the useful data we picked out is saved to a file or a database.
2-2. How a Crawler Works
(1) Fetch data: the crawler sends a request to the server for the URL we give it, and the server returns data;
(2) Parse data: the crawler parses the server's response into a form we can read;
(3) Extract data: the crawler then extracts the data we need from it;
(4) Store data: the crawler saves the useful data for later use and analysis. The sketch below walks through these four steps.
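A minimal sketch of those four steps using requests and BeautifulSoup; the target URL and the extraction rule are placeholders for illustration, not part of the original notes.
```python
import requests
from bs4 import BeautifulSoup

# (1) Fetch: ask the server for a page (placeholder URL)
response = requests.get('https://example.com/')

# (2) Parse: turn raw HTML into a structure we can query
soup = BeautifulSoup(response.text, 'lxml')

# (3) Extract: keep only the pieces we care about
titles = [a.text for a in soup.find_all('a')]

# (4) Store: save the useful data to a file
with open('titles.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(titles))
```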
————————————————
Original notes by 「yk 坤帝」.
Chapter 2: requests in Practice (Basic Crawlers)
1. Scraping Douban Movies
```python
import requests
import json

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36'
}
# Assumed endpoint: Douban's movie-ranking API, inferred from the parameters below
url = 'https://movie.douban.com/j/chart/top_list'
params = {
    'type': '24',
    'interval_id': '100:90',
    'action': '',
    'start': '0',   # index of the first movie to fetch
    'limit': '20'   # number of movies per request
}
response = requests.get(url=url, params=params, headers=headers)
list_data = response.json()

with open('./douban.json', 'w', encoding='utf-8') as fp:
    # ensure_ascii=False keeps Chinese characters readable in the output file
    json.dump(list_data, fp=fp, ensure_ascii=False)
print('over!!!!')
```
2. KFC Restaurant Lookup
```python
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36'
}
# Assumed endpoint: KFC's store-list interface, queried via POST
url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'
word = input('Enter a location: ')
params = {
    'cname': '',
    'pid': '',
    'keyword': word,
    'pageIndex': '1',
    'pageSize': '10'
}
response = requests.post(url=url, data=params, headers=headers)
page_text = response.text

fileName = word + '.txt'
with open(fileName, 'w', encoding='utf-8') as f:
    f.write(page_text)
```
3. Cracking Baidu Translate
```python
import requests
import json

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36'
}
# Assumed endpoint: the suggestion interface Baidu Translate calls via ajax
post_url = 'https://fanyi.baidu.com/sug'
word = input('enter a word:')
data = {
    'kw': word
}
response = requests.post(url=post_url, data=data, headers=headers)
dic_obj = response.json()

fileName = word + '.json'
fp = open(fileName, 'w', encoding='utf-8')
# ensure_ascii=False: Chinese characters cannot be represented in ASCII
json.dump(dic_obj, fp=fp, ensure_ascii=False)
fp.close()
print('over!')
```
4. Sogou Homepage
```python
import requests

url = 'https://www.sogou.com/'   # assumed target: the Sogou homepage
page_text = requests.get(url=url).text
print(page_text)
print('Done crawling!!!')
```
5. Web Page Collector
```python
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36'
}
# Assumed endpoint: Sogou's search interface; `query` carries the keyword
url = 'https://www.sogou.com/web'
kw = input('enter a word:')
param = {
    'query': kw
}
response = requests.get(url=url, params=param, headers=headers)
page_text = response.text

fileName = kw + '.html'
with open(fileName, 'w', encoding='utf-8') as fp:
    fp.write(page_text)
print(fileName, 'saved successfully!!!')
```
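Next, a paginated POST exercise: collect record IDs from a list API, then request each detail record by ID. A sketch assuming the NMPA cosmetics-licence endpoints, inferred from the surviving form fields.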
```python
import requests
import json

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4385.0 Safari/537.36'
}
# Assumed endpoint for the paginated company list
url = 'http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsList'
id_list = []        # licence IDs collected from the list pages
all_data_list = []  # detail data for every company

for page in range(1, 6):
    page = str(page)
    data = {
        'on': 'true',
        'page': page,
        'pageSize': '15',
        'productName': '',
        'conditionType': '1',
        'applyname': '',
        'applysn': ''
    }
    json_ids = requests.post(url=url, headers=headers, data=data).json()
    for dic in json_ids['list']:
        id_list.append(dic['ID'])
#print(id_list)

# Assumed endpoint for the per-company detail record
post_url = 'http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsById'
for id in id_list:
    data = {
        'id': id
    }
    detail_json = requests.post(url=post_url, headers=headers, data=data).json()
    #print(detail_json, '---------------------over')
    all_data_list.append(detail_json)

with open('./allData.json', 'w', encoding='utf-8') as fp:
    json.dump(all_data_list, fp=fp, ensure_ascii=False)
print('over!!!')
```
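BeautifulSoup basics: load a local HTML file and poke at its tags. A minimal sketch, assuming the course's local test.html practice file.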
```python
from bs4 import BeautifulSoup

# Parse a local HTML file (assumed ./test.html, the course's practice page)
fp = open('./test.html', 'r', encoding='utf-8')
soup = BeautifulSoup(fp, 'lxml')
#print(soup)
#print(soup.a)   # the first <a> tag in the document
```
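A fuller bs4 exercise: scrape a chaptered novel, taking titles from the table of contents and text from each chapter's detail page. The sketch assumes the shicimingju.com "Romance of the Three Kingdoms" page this classic exercise targets.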
```python
from bs4 import BeautifulSoup
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36'
}
url = 'https://www.shicimingju.com/book/sanguoyanyi.html'   # assumed target
page_text = requests.get(url=url, headers=headers).text
#print(page_text)

soup = BeautifulSoup(page_text, 'lxml')
li_list = soup.select('.book-mulu > ul > li')   # table of contents entries
fp = open('./sanguo.txt', 'w', encoding='utf-8')
for li in li_list:
    title = li.a.string
    #print(title)
    detail_url = 'https://www.shicimingju.com' + li.a['href']
    print(detail_url)
    detail_page_text = requests.get(url=detail_url, headers=headers).text
    detail_soup = BeautifulSoup(detail_page_text, 'lxml')
    div_tag = detail_soup.find('div', class_='chapter_content')
    fp.write(title + ':' + div_tag.text + '\n')
    print(title, 'scraped successfully!!!')
```
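xpath basics with lxml.etree: parse the local test.html and try a few selectors. Only the prints survive in these notes, so the expressions shown are standard practice examples, included as assumptions.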
```python
from lxml import etree

# Parse the local practice page (an HTMLParser helps with loose HTML)
tree = etree.parse('test.html', etree.HTMLParser())
r = tree.xpath('/html/body/div')                 # absolute path
# r = tree.xpath('//div')                        # every <div> in the document
# r = tree.xpath('//div[@class="song"]')         # filter by attribute
# r = tree.xpath('//div[@class="song"]/p[3]')    # indexed child (1-based)
# r = tree.xpath('//div[@class="tang"]//li[5]/a/text()')  # text content
# r = tree.xpath('//li[7]//text()')              # all text under a node
# r = tree.xpath('//div[@class="song"]/img/@src')         # attribute value
print(r)
```
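Batch-downloading images with xpath: pull the src and alt from each list item and write the bytes to a folder. Assumed target: the 4K boards on pic.netbian.com, as in the classic exercise.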
```python
import requests
from lxml import etree
import os

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36'
}
url = 'http://pic.netbian.com/4kmeinv/'   # assumed target
page_text = requests.get(url=url, headers=headers).text
tree = etree.HTML(page_text)
li_list = tree.xpath('//div[@class="slist"]/ul/li')

if not os.path.exists('./picLibs'):
    os.mkdir('./picLibs')
for li in li_list:
    img_src = 'http://pic.netbian.com' + li.xpath('./a/img/@src')[0]
    img_name = li.xpath('./a/img/@alt')[0] + '.jpg'
    # the page is GBK-encoded; re-decode the name to avoid mojibake
    img_name = img_name.encode('iso-8859-1').decode('gbk')
    # print(img_name, img_src)
    img_data = requests.get(url=img_src, headers=headers).content
    img_path = 'picLibs/' + img_name
    #print(img_path)
    with open(img_path, 'wb') as fp:
        fp.write(img_data)
        print(img_name, "downloaded successfully")
```
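Scraping listing titles with xpath; the sketch assumes a 58.com second-hand-housing list page.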
```python
import requests
from lxml import etree

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36'
}
url = 'https://bj.58.com/ershoufang/'   # assumed target
page_text = requests.get(url=url, headers=headers).text
tree = etree.HTML(page_text)
li_list = tree.xpath('//ul[@class="house-list-wrap"]/li')
fp = open('./58.txt', 'w', encoding='utf-8')
for li in li_list:
    title = li.xpath('./div[2]/h2/a/text()')[0]
    print(title)
    fp.write(title + '\n')
```
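The next fragment is just the shared scaffold these xpath exercises start from: requests for fetching, lxml for parsing, os for managing the output directory.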
```python
import requests
from lxml import etree
import os

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36'
}
```
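Collecting every city name from one page: first with two separate xpath passes, then with a single expression using the | union operator. Assumed target: the history-data page on aqistudy.cn.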
```python
import requests
from lxml import etree

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36'
}
url = 'https://www.aqistudy.cn/historydata/'   # assumed target
page_text = requests.get(url=url, headers=headers).text
tree = etree.HTML(page_text)

# First version: hot cities and regular cities gathered with two separate loops
# all_city_name = []
# holt_li_list = tree.xpath('//div[@class="bottom"]/ul/li')
# for li in holt_li_list:
#     all_city_name.append(li.xpath('./a/text()')[0])
# city_name_list = tree.xpath('//div[@class="bottom"]/ul/div[2]/li')
# for li in city_name_list:
#     all_city_name.append(li.xpath('./a/text()')[0])
# print(all_city_name, len(all_city_name))

# Second version: one xpath with the | union operator covers both lists
all_city_name = []
holt_li_list = tree.xpath('//div[@class="bottom"]/ul/li | //div[@class="bottom"]/ul/div[2]/li')
for li in holt_li_list:
    all_city_name.append(li.xpath('./a/text()')[0])
print(all_city_name, len(all_city_name))
```
8. Regex Parsing
```python
import requests
import re
import os

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4385.0 Safari/537.36'
}
url = 'https://www.qiushibaike.com/imgrank/'   # assumed target: the image channel
page_text = requests.get(url=url, headers=headers).text

# Non-greedy regex pulls every image src out of its thumb container
ex = '<div class="thumb">.*?<img src="(.*?)" alt.*?</div>'
img_src_list = re.findall(ex, page_text, re.S)
print(img_src_list)

if not os.path.exists('./qiutuLibs'):
    os.mkdir('./qiutuLibs')
for src in img_src_list:
    src = 'https:' + src   # the page stores protocol-relative URLs
    img_data = requests.get(url=src, headers=headers).content
    img_name = src.split('/')[-1]
    imgPath = './qiutuLibs/' + img_name
    with open(imgPath, 'wb') as fp:
        fp.write(img_data)
        print(img_name, "download complete!!!!!")
```
9. Regex Parsing with Pagination
```python
import requests
import re
import os

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4385.0 Safari/537.36'
}
if not os.path.exists('./qiutuLibs'):
    os.mkdir('./qiutuLibs')

# Assumed page template; %d is filled with the page number
url = 'https://www.qiushibaike.com/imgrank/page/%d/'
for pageNum in range(1, 3):
    new_url = format(url % pageNum)
    page_text = requests.get(url=new_url, headers=headers).text
    ex = '<div class="thumb">.*?<img src="(.*?)" alt.*?</div>'
    img_src_list = re.findall(ex, page_text, re.S)
    print(img_src_list)
    for src in img_src_list:
        src = 'https:' + src
        img_data = requests.get(url=src, headers=headers).content
        img_name = src.split('/')[-1]
        imgPath = './qiutuLibs/' + img_name
        with open(imgPath, 'wb') as fp:
            fp.write(img_data)
            print(img_name, "download complete!!!!!")
```
10. Downloading an Image
```python
import requests

# Placeholder URL: point this at any direct image link
url = 'https://example.com/some_picture.jpg'
img_data = requests.get(url=url).content   # .content returns raw bytes
with open('./qiutu.jpg', 'wb') as fp:
    fp.write(img_data)
```
1. Captcha Recognition for gushiwen.org
A developer account and key can be applied for on the recognition platform.
```python
import requests
from lxml import etree
from fateadm_api import FateadmApi   # the vendor SDK, shown in full below

def TestFunc(imgPath, codyType):
    pd_id = "xxxxxx"       # product info, found on the user-center page
    pd_key = "xxxxxxxx"
    app_id = "xxxxxxx"     # developer revenue-share account, from the developer center
    app_key = "xxxxxxx"
    # Recognition type: see the price page on the official site for the
    # concrete type codes; ask customer service if unsure.
    pred_type = codyType
    api = FateadmApi(app_id, app_key, pd_id, pd_key)
    # Query the balance:
    # print("balance:", api.QueryBalcExtend())
    # Recognize from a file:
    file_name = imgPath
    result = api.PredictFromFileExtend(pred_type, file_name)
    # To recognize raw bytes instead of a file, call the Predict interface:
    # with open(file_name, "rb") as f:
    #     result = api.PredictExtend(pred_type, f.read())
    # just_flag = False
    # if just_flag:
    #     # If the result fails the site's check, the order can be refunded.
    #     # Refunds are only for results that fail site verification; abuse
    #     # may lead to account suspension.
    #     api.JusticeExtend(request_id)
    # card_id = "123"
    # card_key = "123"
    # api.Charge(card_id, card_key)   # top up
    # LOG("print in testfunc")
    return result

# if __name__ == "__main__":
#     TestFunc()

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36'
}
# Assumed flow: fetch the gushiwen login page, save the captcha image,
# then hand it to TestFunc for recognition.
url = 'https://so.gushiwen.org/user/login.aspx'
page_text = requests.get(url=url, headers=headers).text
tree = etree.HTML(page_text)
code_img_src = 'https://so.gushiwen.org' + tree.xpath('//*[@id="imgCode"]/@src')[0]
img_data = requests.get(url=code_img_src, headers=headers).content
with open('./code.jpg', 'wb') as fp:
    fp.write(img_data)

code_text = TestFunc('code.jpg', '30400')   # '30400': assumed numeric/alpha type code
print('Recognition result: ' + code_text)
```
Calling the API
```python
# coding=utf-8
# Condensed reconstruction of the vendor's fateadm_api.py sample SDK.
# Endpoints and signing follow the vendor's published sample; where these
# notes were incomplete, the bodies below are filled in as assumptions.
import hashlib
import time
import json
import requests

FATEA_PRED_URL = "http://pred.fateadm.com"

def LOG(log):
    # Comment out this print when logging is not needed.
    print(log)

class TmpObj():
    def __init__(self):
        self.value = None

class Rsp():
    def __init__(self):
        self.ret_code = -1
        self.cust_val = 0.0
        self.err_msg = "succ"
        self.pred_rsp = TmpObj()
        self.request_id = ""

    def ParseJsonRsp(self, rsp_data):
        if rsp_data is None:
            self.err_msg = "http request failed, get rsp Nil data"
            return
        jrsp = json.loads(rsp_data)
        self.ret_code = int(jrsp["RetCode"])
        self.err_msg = jrsp["ErrMsg"]
        self.request_id = jrsp.get("RequestId", "")
        if self.ret_code == 0:
            rslt_data = jrsp["RspData"]
            if rslt_data is not None and rslt_data != "":
                jrsp_ext = json.loads(rslt_data)
                if "cust_val" in jrsp_ext:
                    data = jrsp_ext["cust_val"]
                    self.cust_val = float(data)
                if "result" in jrsp_ext:
                    data = jrsp_ext["result"]
                    self.pred_rsp.value = data

def CalcSign(pd_id, passwd, timestamp):
    # Two md5 rounds: md5(pd_id + timestamp + md5(timestamp + passwd))
    csign = hashlib.md5((timestamp + passwd).encode()).hexdigest()
    csign = hashlib.md5((pd_id + timestamp + csign).encode()).hexdigest()
    return csign

def CalcCardSign(cardid, cardkey, timestamp, passwd):
    return hashlib.md5((passwd + timestamp + cardid + cardkey).encode()).hexdigest()

def HttpRequest(url, body_data, img_data=""):
    rsp = Rsp()
    post_data = body_data
    files = {
        'img_data': ('img_data', img_data)
    }
    header = {
        'User-Agent': 'Mozilla/5.0',
    }
    rsp_data = requests.post(url, post_data, files=files, headers=header)
    rsp.ParseJsonRsp(rsp_data.text)
    return rsp

class FateadmApi():
    # API wrapper class; parameters: (appID, appKey, pdID, pdKey)
    def __init__(self, app_id, app_key, pd_id, pd_key):
        self.app_id = app_id if app_id is not None else ""
        self.app_key = app_key
        self.pd_id = pd_id
        self.pd_key = pd_key
        self.host = FATEA_PRED_URL

    def SetHost(self, url):
        self.host = url

    # Query the balance. Returns the Rsp; rsp.cust_val holds the balance.
    def QueryBalc(self):
        tm = str(int(time.time()))
        sign = CalcSign(self.pd_id, self.pd_key, tm)
        param = {
            "user_id": self.pd_id,
            "timestamp": tm,
            "sign": sign
        }
        url = self.host + "/api/custval"
        rsp = HttpRequest(url, param)
        if rsp.ret_code == 0:
            LOG("query succ ret: {} cust_val: {}".format(rsp.ret_code, rsp.cust_val))
        else:
            LOG("query failed ret: {} err: {}".format(rsp.ret_code, rsp.err_msg))
        return rsp

    # Query network latency for a recognition type.
    def QueryTTS(self, pred_type):
        tm = str(int(time.time()))
        sign = CalcSign(self.pd_id, self.pd_key, tm)
        param = {
            "user_id": self.pd_id,
            "timestamp": tm,
            "sign": sign,
            "predict_type": pred_type,
        }
        if self.app_id != "":
            asign = CalcSign(self.app_id, self.app_key, tm)
            param["appid"] = self.app_id
            param["asign"] = asign
        url = self.host + "/api/qcrtt"
        rsp = HttpRequest(url, param)
        if rsp.ret_code == 0:
            LOG("query tts succ ret: {}".format(rsp.ret_code))
        else:
            LOG("query tts failed ret: {} err: {}".format(rsp.ret_code, rsp.err_msg))
        return rsp

    # Recognize a captcha. pred_type: recognition type; img_data: image bytes.
    def Predict(self, pred_type, img_data, head_info=""):
        tm = str(int(time.time()))
        sign = CalcSign(self.pd_id, self.pd_key, tm)
        param = {
            "user_id": self.pd_id,
            "timestamp": tm,
            "sign": sign,
            "predict_type": pred_type,
            "up_type": "mt"
        }
        if head_info is not None and head_info != "":
            param["head_info"] = head_info
        if self.app_id != "":
            asign = CalcSign(self.app_id, self.app_key, tm)
            param["appid"] = self.app_id
            param["asign"] = asign
        url = self.host + "/api/capreg"
        files = img_data
        rsp = HttpRequest(url, param, files)
        if rsp.ret_code == 0:
            LOG("predict succ ret: {} pred: {}".format(rsp.ret_code, rsp.pred_rsp.value))
        else:
            LOG("predict failed ret: {} err: {}".format(rsp.ret_code, rsp.err_msg))
            if rsp.ret_code == 4003:
                # lack of money
                LOG("cust_val <= 0 lack of money, please charge immediately")
        return rsp

    # Recognize a captcha from a file. pred_type: recognition type; file_name: file path.
    def PredictFromFile(self, pred_type, file_name, head_info=""):
        with open(file_name, "rb") as f:
            data = f.read()
        return self.Predict(pred_type, data, head_info=head_info)

    # Request a refund for a failed recognition. request_id: the order to refund.
    # Note: Predict only charges when ret_code == 0, so only then is a refund needed.
    # Note 2: refunds are only for results that fail the target site's check;
    # abuse may lead to account suspension.
    def Justice(self, request_id):
        if request_id == "":
            return
        tm = str(int(time.time()))
        sign = CalcSign(self.pd_id, self.pd_key, tm)
        param = {
            "user_id": self.pd_id,
            "timestamp": tm,
            "sign": sign,
            "request_id": request_id
        }
        url = self.host + "/api/capjust"
        rsp = HttpRequest(url, param)
        if rsp.ret_code == 0:
            LOG("justice succ ret: {}".format(rsp.ret_code))
        else:
            LOG("justice failed ret: {} err: {}".format(rsp.ret_code, rsp.err_msg))
        return rsp

    # Top up with a recharge card. cardid: card number; cardkey: card signature.
    def Charge(self, cardid, cardkey):
        tm = str(int(time.time()))
        sign = CalcSign(self.pd_id, self.pd_key, tm)
        csign = CalcCardSign(cardid, cardkey, tm, self.pd_key)
        param = {
            "user_id": self.pd_id,
            "timestamp": tm,
            "sign": sign,
            'cardid': cardid,
            'csign': csign
        }
        url = self.host + "/api/charge"
        rsp = HttpRequest(url, param)
        if rsp.ret_code == 0:
            LOG("charge succ ret: {}".format(rsp.ret_code))
        else:
            LOG("charge failed ret: {} err: {}".format(rsp.ret_code, rsp.err_msg))
        return rsp

    ##
    # Convenience wrappers: return only the field of interest.
    ##
    def ExtendCharge(self, cardid, cardkey):
        # Top up; returns 0 on success
        return self.Charge(cardid, cardkey).ret_code

    def JusticeExtend(self, request_id):
        # Refund; returns 0 on success (see the notes on Justice above)
        return self.Justice(request_id).ret_code

    def QueryBalcExtend(self):
        # Returns only the balance
        return self.QueryBalc().cust_val

    def PredictFromFileExtend(self, pred_type, file_name, head_info=""):
        # Returns only the recognition result
        return self.PredictFromFile(pred_type, file_name, head_info).pred_rsp.value

    def PredictExtend(self, pred_type, img_data, head_info=""):
        # Returns only the recognition result
        return self.Predict(pred_type, img_data, head_info).pred_rsp.value

def TestFunc():
    pd_id = "128292"        # product info, found on the user-center page
    pd_key = "bASHdc/12ISJOX7pV3qhPr2ntQ6QcEkV"
    app_id = "100001"       # developer revenue-share account, from the developer center
    app_key = "123456"
    # Recognition type: see the price page on the official site for the
    # concrete type codes; ask customer service if unsure.
    pred_type = "30400"
    api = FateadmApi(app_id, app_key, pd_id, pd_key)
    # Query the balance:
    LOG("balance: {}".format(api.QueryBalcExtend()))
    # Recognize from a file:
    rsp = api.PredictFromFile(pred_type, 'img.gif')
    # To recognize raw bytes instead of a file, call the Predict interface:
    # with open('img.gif', "rb") as f:
    #     rsp = api.Predict(pred_type, f.read())
    just_flag = False
    if just_flag:
        # If the result fails the site's check, refund the order.
        # Refunds are only for results that fail site verification;
        # abuse may lead to account suspension.
        api.Justice(rsp.request_id)
    # card_id = "123"
    # card_key = "123"
    # api.Charge(card_id, card_key)   # top up
    LOG("print in testfunc")

if __name__ == "__main__":
    TestFunc()
```
1. Using a Proxy
```python
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36'
}
# Route the request through a proxy; the address below is a placeholder.
url = 'https://www.baidu.com/s?wd=ip'
page_text = requests.get(url=url, headers=headers, proxies={'https': '1.2.3.4:3128'}).text
with open('ip.html', 'w', encoding='utf-8') as fp:
    fp.write(page_text)
```
2. Simulated Login to Renren
```python
import requests
from lxml import etree
from fateadm_api import FateadmApi

# Same captcha helper as in the gushiwen example; returns the recognized text.
def TestFunc(imgPath, codyType):
    pd_id = "xxxxx"        # product info, found on the user-center page
    pd_key = "xxxxxxxxxxxxxxxxxx"
    app_id = "xxxxxxxx"    # developer revenue-share account
    app_key = "xxxxxx"
    api = FateadmApi(app_id, app_key, pd_id, pd_key)
    return api.PredictFromFileExtend(codyType, imgPath)

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36'
}
# Assumed flow: fetch the Renren homepage, save the login captcha,
# recognize it, then POST the login form.
url = 'http://www.renren.com/SysHome.do'
page_text = requests.get(url=url, headers=headers).text
tree = etree.HTML(page_text)
code_img_src = tree.xpath('//*[@id="verifyPic_login"]/@src')[0]
with open('./code.jpg', 'wb') as fp:
    fp.write(requests.get(url=code_img_src, headers=headers).content)

result = TestFunc('code.jpg', '30400')   # assumed type code
print('Recognition result: ' + result)

login_url = 'http://www.renren.com/ajaxLogin/login?1=1'   # assumed endpoint
data = {
    'email': 'xxxxxxxx',
    'icode': result,
    'key_id': '1',
    'captcha_type': 'web_login',
    'password': '47e27dd5ef32b31041ebf56ec85a9b1e4233875e36396241c88245b188c56cdb',
    'rkey': 'c655ef0c57a72755f1240d6c0efac67d',
    'f': ''
}
response = requests.post(url=login_url, headers=headers, data=data)
print(response.status_code)
```
(fateadm_api.py is identical to the helper module listed under "Calling the API" above.)
3. Scraping the Current Renren User's Profile Page
```python
import requests
from lxml import etree
from fateadm_api import FateadmApi

# A Session keeps the login cookie, so later requests are authenticated.
session = requests.Session()

# Same captcha helper as before; returns the recognized text.
def TestFunc(imgPath, codyType):
    pd_id = "xxxxxxx"      # product info, found on the user-center page
    pd_key = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
    app_id = "xxxxxxxx"    # developer revenue-share account
    app_key = "xxxxxxxxx"
    api = FateadmApi(app_id, app_key, pd_id, pd_key)
    return api.PredictFromFileExtend(codyType, imgPath)

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36'
}
url = 'http://www.renren.com/SysHome.do'   # assumed login page
page_text = session.get(url=url, headers=headers).text
tree = etree.HTML(page_text)
code_img_src = tree.xpath('//*[@id="verifyPic_login"]/@src')[0]
with open('./code.jpg', 'wb') as fp:
    fp.write(session.get(url=code_img_src, headers=headers).content)

result = TestFunc('code.jpg', '30400')   # assumed type code
print('Recognition result: ' + result)

login_url = 'http://www.renren.com/ajaxLogin/login?1=1'   # assumed endpoint
data = {
    'email': '15893301681',
    'icode': result,
    'key_id': '1',
    'captcha_type': 'web_login',
    'password': '47e27dd5ef32b31041ebf56ec85a9b1e4233875e36396241c88245b188c56cdb',
    'rkey': 'c655ef0c57a72755f1240d6c0efac67d',
    'f': '',
}
session.post(url=login_url, headers=headers, data=data)   # cookie now stored in the session

# With the session cookie in place there is no need to paste a Cookie header:
# headers = {
#     'Cookies'
# }
detail_url = 'http://www.renren.com/xxxxxx/profile'   # placeholder user id
detail_page_text = session.get(url=detail_url, headers=headers).text
with open('bobo.html', 'w', encoding='utf-8') as fp:
    fp.write(detail_page_text)
```
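First, a negative example: wrapping blocking requests calls in coroutines buys no concurrency, because requests.get blocks the event loop. The local URLs assume the Flask test server shown next.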
```python
import requests
import asyncio
import time

start = time.time()
urls = [
    'http://127.0.0.1:5000/bobo',
    'http://127.0.0.1:5000/jay',
    'http://127.0.0.1:5000/tom',
]

async def get_page(url):
    #print('downloading', url)
    # requests is synchronous, so these "coroutines" still run one after another
    page_text = requests.get(url=url).text
    print(page_text)

tasks = []
for url in urls:
    c = get_page(url)
    tasks.append(asyncio.ensure_future(c))

loop = asyncio.get_event_loop()
loop.run_until_complete(asyncio.wait(tasks))

end = time.time()
print('total time', end - start)
```
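A tiny Flask server for timing the async demos: each route sleeps two seconds before answering. The route paths are assumptions matching the handler names.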
```python
from flask import Flask
import time

app = Flask(__name__)

@app.route('/bobo')
def index_bobo():
    time.sleep(2)
    return 'Hello bobo'

@app.route('/jay')
def index_jay():
    time.sleep(2)
    return 'Hello jay'

@app.route('/tom')
def index_tom():
    time.sleep(2)
    return 'Hello tom'

if __name__ == '__main__':
    app.run(threaded=True)
```
3. Multi-Task Coroutines
```python
import asyncio
import time

async def request(url):
    print('downloading', url)
    # await an asyncio-compatible sleep; time.sleep() would block the whole loop
    await asyncio.sleep(2)
    print('done', url)

start = time.time()
urls = ['www.baidu.com', 'www.sogou.com', 'www.goubanjia.com']   # assumed demo list

stasks = []
for url in urls:
    c = request(url)
    stasks.append(asyncio.ensure_future(c))

loop = asyncio.get_event_loop()
loop.run_until_complete(asyncio.wait(stasks))
print(time.time() - start)
```
4. Multi-Task Asynchronous Crawler
```python
import asyncio
import time
import aiohttp   # async HTTP client; requests would block the event loop

start = time.time()
# Assumed targets: the local Flask test server from the earlier demo
urls = [
    'http://127.0.0.1:5000/bobo',
    'http://127.0.0.1:5000/jay',
    'http://127.0.0.1:5000/tom',
]
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36'
}

async def get_page(url):
    print('downloading', url)
    async with aiohttp.ClientSession() as session:
        async with session.get(url, headers=headers) as response:
            page_text = await response.text()
            print('done', url)

tasks = [asyncio.ensure_future(get_page(url)) for url in urls]
loop = asyncio.get_event_loop()
loop.run_until_complete(asyncio.wait(tasks))

end = time.time()
print('total time', end - start)
```
5. Example
```python
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36'
}
# Placeholder GET showing the headers in use; swap in the real target URL.
page_text = requests.get(url='https://www.baidu.com/', headers=headers).text
print(page_text)
```
6. Synchronous Crawler
```python
import requests
import time

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36'
}
# Placeholder targets; any few sizable pages will do for the timing comparison.
urls = [
    'https://www.baidu.com/',
    'https://www.sogou.com/',
    'https://www.qq.com/',
]

def get_content(url):
    print('crawling:', url)
    # get() blocks until the download finishes, so URLs are fetched one by one
    response = requests.get(url=url, headers=headers)
    if response.status_code == 200:
        return response.content

def parse_content(content):
    print('response length:', len(content))

start_time = time.time()
for url in urls:
    content = get_content(url)
    parse_content(content)
print('total time:', time.time() - start_time)
```
7. Thread Pool Basics
```python
# Serial version, kept for comparison:
# import time
# def get_page(str):
#     print('downloading:', str)
#     time.sleep(2)
#     print('done:', str)
# name_list = ['xiaozi', 'aa', 'bb', 'cc']
# start_time = time.time()
# for i in range(len(name_list)):
#     get_page(name_list[i])
# end_time = time.time()
# print('%d second' % (end_time - start_time))

import time
from multiprocessing.dummy import Pool   # thread (not process) pool

start_time = time.time()

def get_page(str):
    print('downloading:', str)
    time.sleep(2)
    print('done:', str)

name_list = ['xiaozi', 'aa', 'bb', 'cc']
pool = Pool(4)                   # 4 worker threads
pool.map(get_page, name_list)    # blocks until every task returns
end_time = time.time()
print(end_time - start_time)
```
8. Applying a Thread Pool in a Crawler
```python
import requests
from lxml import etree
import re
from multiprocessing.dummy import Pool

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36'
}
# Assumed target: pearvideo's video channel, the classic thread-pool exercise
url = 'https://www.pearvideo.com/category_5'
page_text = requests.get(url=url, headers=headers).text
tree = etree.HTML(page_text)
li_list = tree.xpath('//ul[@id="listvideoListUl"]/li')

urls = []   # one dict per video: {'name': ..., 'url': ...}
for li in li_list:
    detail_url = 'https://www.pearvideo.com/' + li.xpath('./div/a/@href')[0]
    name = li.xpath('./div/a/div[2]/text()')[0] + '.mp4'
    #print(detail_url, name)
    detail_page_text = requests.get(url=detail_url, headers=headers).text
    # The video address used to sit in the page source; the site now returns it
    # via an XHR request instead, so this regex no longer matches the raw page.
    ex = 'srcUrl="(.*?)",vdoUrl'
    video_url = re.findall(ex, detail_page_text, re.S)[0]
    dic = {
        'name': name,
        'url': video_url
    }
    urls.append(dic)

def get_video_data(dic):
    url = dic['url']
    print(dic['name'], 'downloading......')
    data = requests.get(url=url, headers=headers).content
    with open(dic['name'], 'wb') as fp:   # fixed: the mode was inside the key lookup
        fp.write(data)
        print(dic['name'], 'downloaded!')

pool = Pool(4)
pool.map(get_video_data, urls)   # download the videos concurrently
pool.close()
pool.join()
```
9. Coroutines
```python
import asyncio

async def request(url):
    print('requesting url:', url)
    print('request succeeded,', url)
    return url

# Calling an async function returns a coroutine object; nothing runs yet.
c = request('www.baidu.com')

loop = asyncio.get_event_loop()
task = loop.create_task(c)   # wrap the coroutine in a task
# print(task)                # still pending at this point

def callback_func(task):
    # task.result() holds the coroutine's return value
    print(task.result())

task.add_done_callback(callback_func)   # fires once the task completes
loop.run_until_complete(task)
# print(task)                # finished now
```
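selenium lets the browser render JavaScript-driven pages before we parse them. A sketch assuming the NMPA portal from the earlier exercise, and a chromedriver binary in the working directory.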
```python
from selenium import webdriver
from lxml import etree
from time import sleep

bro = webdriver.Chrome(executable_path='./chromedriver')
bro.get('http://scxk.nmpa.gov.cn:81/xk/')   # assumed target

page_text = bro.page_source   # HTML after JavaScript has run
tree = etree.HTML(page_text)
li_list = tree.xpath('//ul[@id="gzlist"]/li')
for li in li_list:
    name = li.xpath('./dl/@title')[0]
    print(name)
sleep(5)
bro.quit()
```
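Driving page interactions: typing into a search box, scrolling via JavaScript, clicking, and walking the browser history. The sketch assumes the classic Taobao/Baidu demo targets.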
```python
from selenium import webdriver
from time import sleep

bro = webdriver.Chrome(executable_path='./chromedriver')
bro.get('https://www.taobao.com/')   # assumed target
search_input = bro.find_element_by_id('q')   # locate the search box
search_input.send_keys('Iphone')
sleep(2)
# Execute JavaScript to scroll one screen down
bro.execute_script('window.scrollTo(0, document.body.scrollHeight)')
sleep(2)
btn = bro.find_element_by_css_selector('.btn-search')
print(type(btn))
btn.click()
sleep(2)
bro.get('https://www.baidu.com')
sleep(2)
bro.back()      # browser history: back, then forward
sleep(2)
bro.forward()
sleep(5)
bro.quit()
```
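Click-captcha workflow: screenshot the page, crop the captcha with PIL using the element's location and size, then click the coordinates returned by a recognition service with ActionChains. Assumed target: the old 12306 login page with its click captcha.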
```python
# Sophomore-year notes
from selenium import webdriver
from selenium.webdriver import ActionChains
import time
from PIL import Image

# Headless/anti-detection options (see the next section) can be added here:
# chrome_options = Options()
# option = ChromeOptions()

bro = webdriver.Chrome(executable_path='./chromedriver')
bro.get('https://kyfw.12306.cn/otn/login/init')   # assumed target
time.sleep(1)

bro.save_screenshot('aa.png')   # full-page screenshot
code_img_ele = bro.find_element_by_xpath('//*[@id="loginForm"]/div/ul[2]/li[4]/div/div/div[3]/img')
location = code_img_ele.location   # top-left corner of the captcha element
print('location:', location)
size = code_img_ele.size           # width and height of the captcha element
print('size', size)
# Crop rectangle: (left, top, right, bottom)
rangle = (
    int(location['x']), int(location['y']),
    int(location['x']) + int(size['width']), int(location['y']) + int(size['height'])
)
print(rangle)
i = Image.open('./aa.png')
frame = i.crop(rangle)
frame.save('./code.png')

# With a visible browser the captcha coordinates come out misaligned;
# a headless browser yields usable coordinates.
'''
# `result` is the coordinate string returned by the captcha service,
# e.g. "x1,y1|x2,y2"; parse it into [[x1, y1], [x2, y2], ...]
all_list = []
if '|' in result:
    list_1 = result.split('|')
    count_1 = len(list_1)
    for i in range(count_1):
        xy_list = []
        x = int(list_1[i].split(',')[0])
        y = int(list_1[i].split(',')[1])
        xy_list.append(x)
        xy_list.append(y)
        all_list.append(xy_list)
else:
    xy_list = []
    x = int(result.split(',')[0])
    y = int(result.split(',')[1])
    xy_list.append(x)
    xy_list.append(y)
    all_list.append(xy_list)
print(all_list)

for l in all_list:
    x = l[0]
    y = l[1]
    # Click at an offset relative to the captcha element
    ActionChains(bro).move_to_element_with_offset(code_img_ele, x, y).click().perform()
    time.sleep(0.5)
'''
```
4. Action Chains and iframe Handling
```python
from selenium import webdriver
from selenium.webdriver import ActionChains
from time import sleep

# Assumed target: the runoob jQuery UI droppable demo, which sits in an iframe
bro = webdriver.Chrome(executable_path='./chromedriver')
bro.get('https://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
bro.switch_to.frame('iframeResult')   # must switch into the iframe first
div = bro.find_element_by_id('draggable')
print(div)

action = ActionChains(bro)
action.click_and_hold(div)   # grab the element, then drag in small steps
for i in range(5):
    action.move_by_offset(17, 0).perform()
    sleep(0.3)
action.release()
bro.quit()
```
5. Headless Chrome + Anti-Detection
```python
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver import ChromeOptions
from time import sleep

chrome_options = Options()
chrome_options.add_argument('--headless')     # run Chrome without a window
chrome_options.add_argument('--disable-gpu')
option = ChromeOptions()                      # hide the "automation" flag
option.add_experimental_option('excludeSwitches', ['enable-automation'])

bro = webdriver.Chrome(executable_path='./chromedriver', chrome_options=chrome_options, options=option)
bro.get('https://www.baidu.com')
print(bro.page_source)
sleep(2)
bro.quit()
```
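The Chaojiying client used for the click captcha: PostPic uploads image bytes and returns the recognized text or coordinates. The upload endpoints follow the vendor's published demo client.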
```python
# 2021-02-18
import requests
from hashlib import md5

class Chaojiying_Client(object):

    def __init__(self, username, password, soft_id):
        self.username = username
        password = password.encode('utf8')
        self.password = md5(password).hexdigest()
        self.soft_id = soft_id
        self.base_params = {
            'user': self.username,
            'pass2': self.password,
            'softid': self.soft_id,
        }
        self.headers = {
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
        }

    def PostPic(self, im, codetype):
        """
        im: image bytes
        codetype: captcha type code, see the vendor's price page
        """
        params = {
            'codetype': codetype,
        }
        params.update(self.base_params)
        files = {'userfile': ('ccc.jpg', im)}
        r = requests.post('http://upload.chaojiying.net/Upload/Processing.php',
                          data=params, files=files, headers=self.headers)
        return r.json()

    def ReportError(self, im_id):
        """
        im_id: picture ID of the misrecognized captcha
        """
        params = {
            'id': im_id,
        }
        params.update(self.base_params)
        r = requests.post('http://upload.chaojiying.net/Upload/ReportError.php',
                          data=params, headers=self.headers)
        return r.json()

# if __name__ == '__main__':
#     chaojiying = Chaojiying_Client('Chaojiying username', 'password', '96001')
#     chaojiying = Chaojiying_Client('xxxxxxxxxx', 'xxxxxxxxxx', 'xxxxxxx')
#     im = open('a.jpg', 'rb').read()
#     print(chaojiying.PostPic(im, 1902))

# selenium (below) drives the browser that produces the captcha screenshot
from selenium import webdriver
import time
```
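What follows is the skeleton of a selenium-driven simulated login; credentials would be read interactively before filling the login form.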
```python
from selenium import webdriver
from time import sleep

#account = input('Enter an account: ')
#password = input('Enter a password: ')
```
## Chapter 8: The scrapy Framework
1. Assorted Project Practice and scrapy Configuration Tweaks
```python
# Sophomore-year notes
import requests
from lxml import etree

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36'
}
# Placeholder URL and xpaths; adapt them to the job-listing site being scraped.
url = 'https://example.com/jobs'
page_text = requests.get(url=url, headers=headers).text
tree = etree.HTML(page_text)
print(tree)
li_list = tree.xpath('//ul/li')
print(li_list)
for li in li_list:
    job_name = li.xpath('./a/text()')[0]
    print(job_name)
```
```python
# -*- coding: utf-8 -*-
# Sophomore-year notes
import requests
from lxml import etree

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36'
}
```