163 lines
6.1 KiB
Python
163 lines
6.1 KiB
Python
"""
|
|
Anti-bot protection module
|
|
Implements various techniques to avoid detection by anti-crawling systems
|
|
"""
|
|
|
|
import random
|
|
import logging
|
|
from typing import Optional, Dict, Any, List
|
|
import httpx
|
|
|
|
from app.config import settings
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
# Predefined desktop User-Agent strings used for rotation.
# Every entry should be a string a real browser actually sends: a malformed
# User-Agent is itself a bot signal. Chromium-based Edge identifies itself as
# "Chrome/<v> Safari/537.36 Edg/<v>" (note "Edg", not "Edge").
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:120.0) Gecko/20100101 Firefox/120.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
]
|
|
|
|
|
|
class AntiBotProtection:
    """Anti-bot protection service.

    Provides randomized request delays, User-Agent rotation, proxy lookup
    against a proxy-pool HTTP service, browser-like request headers, and
    randomized browser fingerprint values.
    """

    # Candidate values sampled by get_fingerprint_data(). Kept at class level
    # so the collections are built once instead of on every call.
    _SCREEN_RESOLUTIONS = (
        "1920x1080", "1366x768", "1440x900", "1536x864",
        "1280x720", "2560x1440", "3840x2160",
    )
    # Only IANA zone names belong here: "Asia/Beijing" is not one (mainland
    # China is "Asia/Shanghai"), so reporting it would mark the fingerprint
    # as synthetic.
    _TIMEZONES = (
        "Asia/Shanghai", "Asia/Macau", "Asia/Hong_Kong",
        "Asia/Taipei", "Asia/Singapore",
    )
    _LANGUAGES = (
        "zh-CN", "zh-CN,zh;q=0.9", "zh-CN,zh;q=0.9,en;q=0.8",
        "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7",
    )
    _COLOR_DEPTHS = (24, 32)
    _PLATFORMS = ("Win32", "MacIntel", "Linux x86_64")
    _HARDWARE_CONCURRENCY = (4, 8, 12, 16)
    _DEVICE_MEMORY = (4, 8, 16, 32)

    def __init__(self):
        """Read the proxy-pool URL and the delay bounds from ``settings``."""
        self.proxy_pool_url = settings.PROXY_POOL_URL
        self.random_delay_min = settings.RANDOM_DELAY_MIN
        self.random_delay_max = settings.RANDOM_DELAY_MAX

    def get_random_delay(self) -> float:
        """
        Generate random delay within configured range.

        Returns:
            Delay in seconds, drawn uniformly between ``random_delay_min``
            and ``random_delay_max``.

        Validates: Requirements 7.1
        """
        delay = random.uniform(self.random_delay_min, self.random_delay_max)
        logger.debug("Generated random delay: %.2fs", delay)
        return delay

    def get_random_user_agent(self) -> str:
        """
        Select random User-Agent from the module-level ``USER_AGENTS`` list.

        Returns:
            User-Agent string.

        Validates: Requirements 7.2
        """
        user_agent = random.choice(USER_AGENTS)
        # Only a prefix is logged to keep debug lines short.
        logger.debug("Selected User-Agent: %s...", user_agent[:50])
        return user_agent

    @staticmethod
    def _build_proxy_mapping(proxy_url: str) -> Dict[str, str]:
        """Build the httpx-style proxy mapping for one proxy address.

        The keys select which *request* scheme is routed through the proxy;
        the value's scheme says how to talk to the *proxy itself*. A bare
        ``host:port`` is treated as a plain HTTP proxy for both request
        schemes, because using ``https://host:port`` would attempt a TLS
        handshake with the proxy. An address that already carries a scheme
        is used unchanged instead of being prefixed a second time.
        """
        target = proxy_url if "://" in proxy_url else f"http://{proxy_url}"
        return {"http://": target, "https://": target}

    async def get_proxy(self) -> Optional[Dict[str, str]]:
        """
        Get proxy from proxy pool service.

        Issues ``GET <proxy_pool_url>/get`` with a 5 second timeout and reads
        the ``"proxy"`` field of the JSON response.

        Returns:
            Mapping with ``"http://"`` and ``"https://"`` keys pointing at the
            proxy, or None when no proxy is available, in which case the
            caller is expected to fall back to a direct connection.

        Never raises: every failure is logged and reported as None.

        Validates: Requirements 7.3, 7.4
        """
        if not self.proxy_pool_url:
            logger.warning("Proxy pool URL is not configured, falling back to direct connection")
            return None

        # Tolerate a trailing slash in the configured base URL.
        endpoint = str(self.proxy_pool_url).rstrip("/") + "/get"

        try:
            async with httpx.AsyncClient(timeout=5.0) as client:
                response = await client.get(endpoint)

                if response.status_code != 200:
                    logger.warning("Proxy pool returned status %s", response.status_code)
                    return None

                proxy_info = response.json()
                # Guard against a payload that is valid JSON but not an object.
                proxy_url = proxy_info.get("proxy") if isinstance(proxy_info, dict) else None

                if not proxy_url:
                    logger.warning("Proxy pool returned empty proxy")
                    return None

                # NOTE(review): assumes the pool returns the proxy as a string
                # ("host:port" or a full URL) — confirm against the pool API.
                proxy_dict = self._build_proxy_mapping(str(proxy_url))
                logger.info("Obtained proxy: %s", proxy_url)
                return proxy_dict

        except httpx.RequestError as e:
            logger.warning("Proxy pool service unavailable: %s, falling back to direct connection", e)
            return None
        except Exception as e:
            # Deliberate best-effort boundary: a broken proxy pool must not
            # stop the crawl, so unexpected errors (e.g. invalid JSON) are
            # logged and turned into "no proxy".
            logger.error("Error getting proxy: %s", e)
            return None

    def build_headers(self, user_agent: Optional[str] = None) -> Dict[str, str]:
        """
        Build HTTP headers with random User-Agent and common headers.

        Args:
            user_agent: Optional custom User-Agent, otherwise random one is selected

        Returns:
            Dict of HTTP headers
        """
        if user_agent is None:
            user_agent = self.get_random_user_agent()

        headers = {
            "User-Agent": user_agent,
            "Accept": "application/json, text/plain, */*",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            # NOTE(review): advertising "br" presumably requires a brotli
            # decoder to be installed for the HTTP client — confirm.
            "Accept-Encoding": "gzip, deflate, br",
            "Connection": "keep-alive",
            "Referer": "https://weibo.com/",
            "Sec-Fetch-Dest": "empty",
            "Sec-Fetch-Mode": "cors",
            "Sec-Fetch-Site": "same-origin",
        }

        return headers

    def get_fingerprint_data(self) -> Dict[str, Any]:
        """
        Generate browser fingerprint data for simulation.

        Each field is sampled independently from its candidate set, so the
        values are not correlated with each other or with any User-Agent.

        Returns:
            Dict containing fingerprint information, with keys
            ``screen_resolution``, ``timezone``, ``language``,
            ``color_depth``, ``platform``, ``hardware_concurrency`` and
            ``device_memory``.
        """
        fingerprint = {
            "screen_resolution": random.choice(self._SCREEN_RESOLUTIONS),
            "timezone": random.choice(self._TIMEZONES),
            "language": random.choice(self._LANGUAGES),
            "color_depth": random.choice(self._COLOR_DEPTHS),
            "platform": random.choice(self._PLATFORMS),
            "hardware_concurrency": random.choice(self._HARDWARE_CONCURRENCY),
            "device_memory": random.choice(self._DEVICE_MEMORY),
        }

        logger.debug("Generated fingerprint: %s", fingerprint)
        return fingerprint
|
|
|
|
|
|
# Global instance
|
|
# Created when this module is imported; AntiBotProtection.__init__ reads its
# configuration (proxy pool URL and delay bounds) from `settings` at that point.
antibot = AntiBotProtection()
|