1. The Python Scrapy Crawler Framework
Python, one of today's most representative high-level programming languages, is applied in an ever-widening range of fields. Web crawling is a good example: the Scrapy framework is widely used by crawler engineers at major companies. Scrapy is an application framework written for crawling websites and extracting structured data.
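As a quick illustration of what "extracting structured data" looks like in practice, here is a minimal sketch of a Scrapy spider; the quotes.toscrape.com demo site and the DemoSpider name are used purely for illustration:
import scrapy

class DemoSpider(scrapy.Spider):
    name = 'demo'
    start_urls = ['http://quotes.toscrape.com']

    def parse(self, response):
        # Each div.quote element yields one structured item
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').get(),
                'author': quote.css('small.author::text').get(),
            }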
2. The Cookie Pool
During data scraping, most websites use Cookies to record each user's identity and browsing history, recognize returning users, and serve content accordingly. When developing a web crawler, therefore, simulating a browser and building a complete Cookie pool becomes essential.
2.1 Setting Cookies
Set the Cookies for the site to be crawled in settings.py:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Referer': 'http://weibo.cn/search/',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'
}
# To get around anti-crawler mechanisms, send requests through proxy IPs.
# The list of proxy IPs is stored in proxy_list.txt.
PROXY_LIST = '/path/to/proxy_list.txt'
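The source does not show how PROXY_LIST is consumed; one minimal sketch is a downloader middleware that attaches a random proxy to each request. The one-ip:port-per-line file format and the RandomProxyMiddleware name are assumptions:
import random

class RandomProxyMiddleware(object):
    def __init__(self, proxy_list_path):
        # Assumed format: one "ip:port" per line
        with open(proxy_list_path) as f:
            self.proxies = [line.strip() for line in f if line.strip()]

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings.get('PROXY_LIST'))

    def process_request(self, request, spider):
        # Scrapy's built-in HttpProxyMiddleware honors request.meta['proxy']
        request.meta['proxy'] = 'http://%s' % random.choice(self.proxies)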
2.2 Connecting to the Redis Database
Install Redis first. When Python simulates login, Redis can be used to build the Cookie pool; concretely, Cookies can be retrieved from Redis through the auth_user function.
import redis

class CookiesPool(object):
    def __init__(self):
        self.__conn = redis.Redis(host='localhost', port=6379, db=0, password='******')

    # Fetch the Cookies stored under a random key
    def random_cookies(self):
        key = self.__conn.randomkey()
        if key:
            # Cookies are stored as the str() of a dict, so eval() restores them
            return eval(self.__conn.get(key))
        else:
            return None

    # Store a Cookies dict back into Redis (called by the middleware in 2.3);
    # the default key name here is an assumption, in practice the username is used
    def add_redis_cookies(self, cookies, key='default_user'):
        self.__conn.set(key, str(cookies))
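A quick way to exercise the pool, assuming a local Redis instance with the password above:
pool = CookiesPool()
cookies = pool.random_cookies()
print(cookies if cookies else "Cookie pool is empty")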
2.3 Using Multiple Processes
To speed up Cookie acquisition, multiple processes can be used. The steps are as follows:
In cookies_middleware.py, import the multiprocessing module.
In the middleware class's __init__ method, set the number of processes and start the process pool.
Pass the process-pool instance into the HTTP proxy handling.
import multiprocessing as mp

class WeiboSpiderMiddleware(object):
    def __init__(self):
        self.cookiesPool = CookiesPool()
        self.lock = mp.Lock()
        self.pool = mp.Pool(processes=1)

    def process_request(self, request, spider):
        '''Attach Cookies from the pool to the outgoing request'''
        if 'weibo.cn' in request.url:
            cookies = self.cookiesPool.random_cookies()
            if cookies:
                request.cookies = cookies
                return None
            self.lock.acquire()  # acquire the lock
            # Asynchronously request login_url to log in; __auth_user (the login
            # routine, not shown in this excerpt) returns a Cookies dict
            result_async = self.pool.apply_async(self.__auth_user)
            result = result_async.get()
            if result:
                request.cookies = result
                self.cookiesPool.add_redis_cookies(result)
            self.lock.release()  # release the lock
        return None
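For the middleware to take effect it must also be registered in settings.py; a sketch, assuming the project package is named weibo_spider:
DOWNLOADER_MIDDLEWARES = {
    # Module path and priority are assumptions; adjust to your project layout
    'weibo_spider.cookies_middleware.WeiboSpiderMiddleware': 543,
}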
3. Using a Weibo Cookie Pool
The following are the concrete steps for crawling Weibo with Python and building a Cookie pool:
3.1 Fetching Pre-Login Information
To simulate login, first define a user's personal information, including the account name and password, in settings.py. Then create a new file get_config.py and define a class in it for fetching the pre-login information. The pre-login information is obtained mainly from the PC homepage and URLs such as GenURL, together with their parameters.
import re

import requests
from fake_useragent import UserAgent

class Cookies(object):
    def __init__(self, username, password):
        self.username = username
        self.password = password
        self.ua = UserAgent()

    def __get_home_html(self):
        url = 'https://weibo.com'
        headers = {
            'User-Agent': self.ua.random
        }
        response = requests.get(url, headers=headers)
        # Undo the \uXXXX escaping in the returned page
        return response.text.encode(response.encoding).decode('unicode_escape')

    # Extract the st token from the home page
    def get_st(self):
        login_html = self.__get_home_html()
        pattern = r'"st":"(.{4,40}?)",'
        p = re.compile(pattern)
        result = p.findall(login_html)
        if len(result) != 0:
            return result[0]
        else:
            return None
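A quick check of the st extraction, with placeholder credentials:
st = Cookies('user@example.com', 'password').get_st()
print('st token:', st)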
3.2 Simulating Weibo Login
In the file weibo_loginer.py, define a login class that uses the Requests library to simulate logging in to Weibo. After a successful login, store the Cookies and related information in the Redis database so that the API and subsequent crawlers can use them.
import logging

import redis
import requests
from fake_useragent import UserAgent

# Cookies (the st extractor) is defined in get_config.py above
from get_config import Cookies

class WeiboLoginer(object):
    def __init__(self, account, password):
        self.account = account
        self.password = password
        self.logger = logging.getLogger(__name__)
        # Rotating User-Agent headers help avoid anti-crawler detection
        self.headers = {'User-Agent': UserAgent().random}
        # Derive the login username from the account
        if self.account is None:
            self.username = "default_user"
        elif "@" in self.account:
            self.username = self.account.split("@")[0]
        else:
            self.username = self.account

    def __get_cookies(self, username):
        cookie_dict = {}
        conn = redis.Redis(host='localhost', port=6379, db=0, password='******')
        # A stored Cookies dict is considered valid only if it contains both
        # the SUE and _T_WM cookies
        filter_names = ('SUE', '_T_WM')
        keys = conn.keys()
        for key in keys:
            cookie = eval(str(conn.get(key), encoding='utf-8'))
            is_find = True
            for name in filter_names:
                if name not in cookie:
                    is_find = False
            if is_find:
                cookie_dict = cookie
                break
        return cookie_dict

    def login(self):
        cookie_jar = requests.cookies.RequestsCookieJar()
        login_info = {'username': self.account, 'password': self.password,
                      'savestate': '1', 'ec': '0', 'entry': 'mweibo',
                      'mainpageflag': '1'}
        login_url = 'https://passport.weibo.cn/sso/login'
        # Fetch the st token needed by the login form
        st = Cookies(self.account, self.password).get_st()
        # Pre-login request
        data = {
            "callback": "sinaSSOController.preloginCallBack",
            "su": "MTIzNDU2Nzg5MA==",
            "rsakt": "mod",
            "client": "ssologin.js(v1.4.19)",
            "_": 1491221611424
        }
        r = requests.get('https://login.sina.com.cn/sso/prelogin.php', params=data)
        # Bind the remaining login fields
        login_info.update({
            'from': '',
            'wentry': '',
            'loginfrom': '',
            'client_id': '',
            'code': '',
            'qq': '',
            'hff': '',
            'hfp': ''
        })
        # Submit the login form
        temp_cookies = requests.post(url=login_url, data=login_info,
                                     headers=self.headers).cookies
        # A captcha-recognition login step would follow here; its request
        # parameters (postUrl, postdata) are elided in the source
        cookie_jar.update(temp_cookies)
        cookies = requests.utils.dict_from_cookiejar(cookie_jar)
        # Store the Cookies in Redis as the str() of a dict
        conn = redis.Redis(host='localhost', port=6379, db=0, password='******')
        conn.set(self.username, str(cookies))
        return True
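A usage sketch with placeholder credentials; it requires a reachable Redis instance:
loginer = WeiboLoginer('user@example.com', 'password')
if loginer.login():
    print('Cookies for %s stored in Redis' % loginer.username)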
3.3 Automatic Cookie Refresh
To avoid stale Cookies accumulating in the pool, they must be refreshed automatically so that every Cookie in the pool remains usable.
import logging
from datetime import datetime

import redis
from scrapy.utils.project import get_project_settings

# WeiboLoginer is defined in weibo_loginer.py above
from weibo_loginer import WeiboLoginer

# Load the project configuration
settings = get_project_settings()
START_TIME = datetime.now()

class Cookies(object):
    def __init__(self, username):
        self.username = username
        self.logger = logging.getLogger(__name__)
        self.client = WeiboLoginer(self.username, settings.get('WEIBO_PASSWORD'))

    @staticmethod
    def cookie_show():
        conn = redis.Redis(host='localhost', port=6379, db=0, password='******')
        keys = conn.keys()
        for key in keys:
            print("%s:%s" % (key, conn.get(key)))

    def get_cookies(self):
        global START_TIME
        current_time = datetime.now()
        conn = redis.Redis(host='localhost', port=6379, db=0, password='******')
        # Re-login when no Cookies are cached or the cache is older than 30 minutes
        if not conn.get(self.username) or (current_time - START_TIME).total_seconds() > (60 * 30):
            self.client.login()
            START_TIME = current_time
        return conn.get(self.username)
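One way to drive the refresher periodically is a simple loop; the WEIBO_USERNAME setting name and the loop itself are assumptions, while the 30-minute interval mirrors the check above:
import time

refresher = Cookies(settings.get('WEIBO_USERNAME'))
while True:
    # get_cookies() triggers a re-login whenever the cached entry is stale
    print(refresher.get_cookies())
    time.sleep(60 * 30)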
3.4 A Chain of Responsibility for Cookie Refresh Monitoring
Build a chain of responsibility over the Cookies so that refreshes can be performed and monitored automatically.
import logging

class CookiesMiddleware(object):
    def __init__(self, account=None, passwd=None):
        self.account = account
        self.passwd = passwd
        self.logger = logging.getLogger(__name__)
        self.logger.info("Initializing CookiesMiddleware")

    def process_request(self, request, spider):
        if 'weibo.cn' in request.url:
            self.logger.info("self.account: %s", self.account)
            username = self.account.split("@")[0]
            # 1. Look up the user's Cookies in the pool (redis_conn and
            # RedisCookiesPool are assumed to be defined elsewhere)
            cookies = RedisCookiesPool(redis_conn).get_redis_cookies(username)
            self.logger.info("Available Cookies: %s", cookies)
            request.cookies = cookies
            # 2. If the request needs checking, walk the chain of responsibility
            if self.__check_request(request):
                return self.__check_chain()
        return None

    def __check_chain(self):
        # 1. Check the network and retry
        check_network = CheckNetworkMiddleware()
        # 2. Check the proxy IP and retry
        check_proxy = CheckProxyMiddleware()
        # 3. Check whether the account is usable
        check_login = CheckLoginMiddleware(self.account, self.passwd)
        check_network.set_next(check_proxy)
        check_proxy.set_next(check_login)
        return check_network.handle_request()

    def __check_request(self, request):
        if request.meta.get('check', False):
            return False
        if not request.cookies:
            return False
        return True
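The three check middlewares referenced above are not defined in the source. Below is a minimal sketch of the chain-of-responsibility base class they could share; everything here beyond the names already used above is an assumption:
class CheckHandler(object):
    """Base handler: each check either resolves the problem or defers to the next."""
    def __init__(self):
        self._next = None

    def set_next(self, handler):
        self._next = handler
        return handler

    def handle_request(self):
        result = self.check()
        if result is not None:
            return result
        if self._next is not None:
            return self._next.handle_request()
        return None

    def check(self):
        # Subclasses return a result to stop the chain, or None to defer
        raise NotImplementedError

class CheckNetworkMiddleware(CheckHandler):
    def check(self):
        # e.g. probe connectivity and schedule a retry on failure
        return None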
3.5 Building the Cookie Pool
As the final step, test the Cookies that have been built and store them in Redis, forming the Cookie pool.
from scrapy import Request

def use_cookie_pool():
    account = "example@example.com"
    passwd = "123456"
    middleware = CookiesMiddleware(account, passwd)
    # process_request attaches the Cookies to the request itself
    # (WeiboSpider is the project's spider class, assumed defined elsewhere)
    request = Request("http://weibo.cn", meta={'check': True})
    middleware.process_request(request, WeiboSpider())
    print(request.cookies)
    if request.cookies:
        print("login success")
    else:
        print("login fail")
    # A randomly assigned set of Cookies
    request = Request("http://weibo.cn")
    middleware.process_request(request, WeiboSpider())
    print(request.cookies)
4. Summary
When developing Python crawlers, understanding how a Cookie pool is built, and then building one, effectively solves the problems caused by expired Cookies and reduces the risk of the crawler being blocked by anti-crawler mechanisms. It also greatly improves the success rate of network requests in subsequent scraping jobs. In short, mastering the construction of Cookie pools is an important skill for developers of Python crawler tools.