最近在爬一些网站,总是遇到Cloudflare 5s盾网站安全检测机制,就像这样:

于是就写了个自动化程序(基于 DrissionPage 驱动 Chrome 浏览器),用于自动通过 Cloudflare 5s 盾的网站安全检测,从而获取目标网站的页面源码。

cloudflare5s.py:

import asyncio
import traceback
import platform
from DrissionPage import ChromiumPage, ChromiumOptions
from loguru import logger

# User-Agent string applied to the automated browser via ChromiumOptions.set_user_agent
# (identifies itself as Chrome 126 on macOS).
ua = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"


class Cloudflare5sBypass(object):
    """Open pages in a DrissionPage-controlled Chromium and wait out the Cloudflare check.

    A page is considered cleared once a ``cf_clearance`` cookie is present on
    the tab; at that point the tab's HTML is returned to the caller.
    """

    # Chrome location used on Windows when the caller does not pass ``browser_path``.
    DEFAULT_WINDOWS_BROWSER_PATH = r"C:\Program Files\Google\Chrome\Application\chrome.exe"
    # Cookie whose presence is treated as "challenge passed".
    CLEARANCE_COOKIE = "cf_clearance"

    # The ChromiumPage instance; assigned in __init__.
    driver = None

    def __init__(self, proxy_server=None, browser_path=None):
        """Configure and launch the browser.

        Args:
            proxy_server: Optional proxy address handed to
                ``ChromiumOptions.set_proxy``; no proxy is set when falsy.
            browser_path: Optional path to the browser executable. When
                omitted, ``DEFAULT_WINDOWS_BROWSER_PATH`` is used on Windows
                and no explicit path is set on other platforms, so the
                Windows-only default is never applied where it cannot exist.
        """
        _platform = platform.system().lower()
        if browser_path is None and _platform == "windows":
            browser_path = self.DEFAULT_WINDOWS_BROWSER_PATH

        options = ChromiumOptions()
        if browser_path:
            options.set_paths(browser_path=browser_path)
        options.set_user_agent(ua)
        arguments = [
            "--accept-lang=en-US",  # prefer English (US) as the browser language
            "--no-first-run",  # skip the first-run welcome page and setup wizard
            "--force-color-profile=srgb",  # force the sRGB color profile
            "--metrics-recording-only",  # record usage metrics only, do not send them to a server
            "--password-store=basic",  # use the basic password store
            "--use-mock-keychain",  # use a mock keychain (test setups; nothing is written to the system keychain)
            "--export-tagged-pdf",  # export tagged PDFs
            "--no-default-browser-check",  # skip the default-browser check
            # Enable specific browser features (network service, crypto token extension, ...).
            "--enable-features=NetworkService,NetworkServiceInProcess,LoadCryptoTokenExtension,PermuteTLSExtensions",
            "--disable-gpu",  # disable GPU hardware acceleration
            "--disable-infobars",  # hide the info bar
            # "--disable-extensions",  # disable browser extensions
            "--disable-popup-blocking",  # do not block pop-up windows
            "--disable-background-mode",  # disable background mode
            # Disable specific browser features (Flash deprecation warning, password account storage, ...).
            "--disable-features=FlashDeprecationWarning,EnablePasswordsAccountStorage,PrivacySandboxSettings4",
            "--deny-permission-prompts",  # deny permission prompts
            "--disable-suggestions-ui",  # disable the suggestions UI
            "--hide-crash-restore-bubble",  # hide the crash-restore bubble
            "--window-size=800,600",  # open the window at 800x600 pixels
            "--disable-mobile-emulation",  # disable mobile device emulation
        ]
        if _platform == "linux":
            # On Linux the browser is run headless and without the sandbox.
            options.headless(True)
            arguments.append("--no-sandbox")

        if proxy_server:
            options.set_proxy(proxy_server)

        for argument in arguments:
            options.set_argument(argument)

        self.driver = ChromiumPage(addr_or_opts=options)

    @staticmethod
    def _has_clearance_cookie(cookies):
        """Return True when any cookie dict in *cookies* is named ``cf_clearance``."""
        return any(
            cookie.get("name") == Cloudflare5sBypass.CLEARANCE_COOKIE
            for cookie in cookies
        )

    async def get_web_content(self, _url):
        """Open *_url* in a new tab and return its HTML once the check is passed.

        Clears the browser cookies first, then calls ``bypass`` up to five
        times with a 3-second pause between attempts. If no attempt yields
        content, the tab's current HTML is returned as a fallback. The tab is
        always closed before returning.

        Args:
            _url: Address of the page to load.

        Returns:
            The page HTML, or ``None`` if an exception was logged before any
            content was captured.
        """
        self.driver.set.cookies.clear()
        web_content = None
        tab_id = self.driver.new_tab(_url).tab_id
        tab = self.driver.get_tab(tab_id)
        try:
            await asyncio.sleep(3)
            for _ in range(5):
                # loguru formats with str.format, so the title needs a "{}" placeholder
                # to actually appear in the log line.
                logger.debug("正在检测是否人机验证页面... {}", tab.title)
                web_content = await self.bypass(tab)
                if web_content:
                    break
                await asyncio.sleep(3)
            if not web_content:
                logger.info("默认返回页面源码")
                web_content = tab.html
        except Exception:
            # Only ordinary errors are logged and swallowed; task cancellation and
            # KeyboardInterrupt are BaseException subclasses and propagate.
            logger.error(traceback.format_exc())
        finally:
            tab.close()
        return web_content

    async def bypass(self, _tab):
        """Click the verification element if it is shown, then look for the clearance cookie.

        Args:
            _tab: The browser tab to inspect.

        Returns:
            The tab's HTML when a ``cf_clearance`` cookie is present,
            otherwise ``None``.
        """
        # Alternative selector: "#turnstile-wrapper"
        ele_flag = ".spacer"
        logger.debug("-------------debug1-------------")
        if _tab.wait.ele_displayed(ele_flag, timeout=1.5):
            logger.debug("-------------debug2-------------")
            verify_element = _tab.ele(ele_flag, timeout=2.5)
            logger.debug("-------------debug3-------------")
            if verify_element:
                # Give the verification box time to load; depends on network speed.
                await asyncio.sleep(5)
                logger.debug("-------------debug4-------------")
                verify_element.click()
                logger.debug("点击了人机验证按钮")
                # Give the real page time to load after the check; depends on network speed.
                await asyncio.sleep(5)

        if self._has_clearance_cookie(_tab.cookies()):
            logger.success("成功绕过cloudflare5s盾,返回页面源码!")
            return _tab.html
        return None

test.py:

import asyncio

from cloudflare5s import Cloudflare5sBypass

def main():
    """Fetch the demo page through Cloudflare5sBypass and print the returned HTML."""
    proxy_server = None
    # proxy_server = "http://127.0.0.1:443"
    cloudflare5s = Cloudflare5sBypass(proxy_server)
    # Other targets that can be swapped in:
    # url = "https://www.discuss.com.hk/forumdisplay.php?fid=767"
    url = "https://nopecha.com/demo/cloudflare"
    # url = 'https://chatgpt.com'
    web_content = asyncio.run(cloudflare5s.get_web_content(url))
    print(web_content)

    # Uncomment to shut the browser down once the page has been fetched.
    # cloudflare5s.driver.quit()


# Guarded so that importing this module does not launch a browser.
if __name__ == "__main__":
    main()