Update 测绘站采集.py

This commit is contained in:
frxz751113
2025-05-03 09:20:35 +08:00
committed by GitHub
parent d4d835a1f3
commit ce1d5f5f9f
+50 -3
View File
@@ -47,6 +47,11 @@ os.makedirs('playlist', exist_ok=True)
DELAY_RANGE = (3, 6) # 随机延迟时间范围(秒) DELAY_RANGE = (3, 6) # 随机延迟时间范围(秒)
MAX_RETRIES = 3 # 最大重试次数 MAX_RETRIES = 3 # 最大重试次数
REQUEST_TIMEOUT = 10 # 请求超时时间(秒) REQUEST_TIMEOUT = 10 # 请求超时时间(秒)
PROXY_REFRESH_INTERVAL = 300 # Proxy refresh interval (seconds)
# Proxy pool state, read and rebound by get_proxies()
proxies = []  # proxy URLs currently in the pool; empty until the first refresh
last_refresh_time = 0  # time.time() value stored at the most recent refresh; 0 before any refresh
def get_random_header(): def get_random_header():
"""生成随机请求头""" """生成随机请求头"""
@@ -56,16 +61,59 @@ def get_random_header():
'Referer': 'https://fofa.info/' 'Referer': 'https://fofa.info/'
} }
def scrape_proxies_89ip(url):
    """Scrape a proxy list from an 89ip.cn extraction page.

    Downloads *url*, takes the section of the response body that follows the
    first '提取结果' marker, and collects every ``ip:port`` entry in it.

    Args:
        url: Address of the 89ip.cn proxy extraction page.

    Returns:
        A list of proxy URLs of the form ``http://ip:port`` (every proxy is
        treated as an HTTP proxy). An empty list is returned when the request
        fails, the marker is absent, or no entry is found.
    """
    try:
        # Bound the wait so a stalled proxy site cannot hang the whole run;
        # uses the same timeout as the other requests in this file.
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        page_text = response.text
    except Exception as e:
        # Deliberately broad: scraping is best-effort and callers rely on
        # getting an empty list instead of an exception.
        print(f"抓取代理失败: {str(e)}")
        return []

    sections = page_text.split('提取结果')
    if len(sections) < 2:
        # Report the real cause rather than an opaque IndexError message.
        print("抓取代理失败: 响应中未找到提取结果标记")
        return []

    # Find ip:port entries wherever they sit in the section, so the result
    # does not depend on each entry being alone on its own line.
    # NOTE(review): the page may separate entries with <br> instead of
    # newlines - confirm against a live response.
    # The lookarounds reject matches that are fragments of longer numbers.
    addresses = re.findall(
        r'(?<![\d.])\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}(?!\d)',
        sections[1],
    )

    # All proxies default to the HTTP scheme.
    return [f"http://{address}" for address in addresses]
def get_proxies():
    """Return the proxy pool, refreshing it when it is empty or stale.

    The pool is kept in the module-level ``proxies`` list and
    ``last_refresh_time`` records when a refresh was last attempted. A
    refresh is attempted when the pool is empty or more than
    ``PROXY_REFRESH_INTERVAL`` seconds have passed since the last attempt.
    If a refresh yields nothing while the pool still holds entries, the
    existing entries are kept instead of being thrown away.

    Returns:
        The module-level list of proxy URLs; never empty.

    Raises:
        RuntimeError: If the pool is empty and the refresh returned no
            proxies.
    """
    global proxies, last_refresh_time

    current_time = time.time()
    # Re-scrape when the pool is empty or the refresh interval has elapsed.
    if not proxies or current_time - last_refresh_time > PROXY_REFRESH_INTERVAL:
        print("正在刷新代理列表...")
        proxy_url = "https://www.89ip.cn/tqdl.html?num=60&address=&kill_address=&port=&kill_port=&isp="
        fresh_proxies = scrape_proxies_89ip(proxy_url)
        # Record the attempt even when it fails, so a populated pool is not
        # re-scraped on every call while the proxy site is unavailable.
        last_refresh_time = current_time
        if fresh_proxies:
            proxies = fresh_proxies
        # Otherwise keep whatever the pool already held: stale proxies are
        # still more useful to the caller than none at all.

    if not proxies:
        raise RuntimeError("无法获取代理")
    return proxies
def safe_request(url): def safe_request(url):
"""带重试机制的请求函数""" """带重试机制和代理的请求函数"""
for attempt in range(MAX_RETRIES): for attempt in range(MAX_RETRIES):
try: try:
# 随机延迟防止被封 # 随机延迟防止被封
time.sleep(random.uniform(*DELAY_RANGE)) time.sleep(random.uniform(*DELAY_RANGE))
# 获取代理列表
proxy_list = get_proxies()
# 随机选择一个代理
proxy = random.choice(proxy_list)
print(f"使用代理: {proxy}")
response = requests.get( response = requests.get(
url, url,
headers=get_random_header(), headers=get_random_header(),
proxies={"http": proxy, "https": proxy},
timeout=REQUEST_TIMEOUT timeout=REQUEST_TIMEOUT
) )
@@ -127,7 +175,7 @@ def main():
continue continue
# 构造搜索请求 # 构造搜索请求
search_txt = f'"udpxy" && country="CN" && region="{province}' search_txt = f'"udpxy" && country="CN" && region="{province}"'
encoded_query = base64.b64encode(search_txt.encode()).decode() encoded_query = base64.b64encode(search_txt.encode()).decode()
search_url = f'https://fofa.info/result?qbase64={encoded_query}' search_url = f'https://fofa.info/result?qbase64={encoded_query}'
@@ -161,7 +209,6 @@ def main():
if __name__ == '__main__': if __name__ == '__main__':
main() main()
print('对playlist文件夹里面的所有txt文件进行去重处理') print('对playlist文件夹里面的所有txt文件进行去重处理')
def remove_duplicates_keep_order(folder_path): def remove_duplicates_keep_order(folder_path):
for filename in os.listdir(folder_path): for filename in os.listdir(folder_path):