diff --git a/py/测绘站采集.py b/py/测绘站采集.py index c3b2b68..bcbb1cb 100644 --- a/py/测绘站采集.py +++ b/py/测绘站采集.py @@ -30,159 +30,139 @@ import cv2 from bs4 import BeautifulSoup from urllib.parse import urlparse from translate import Translator # 导入Translator类,用于文本翻译 +# -*- coding: utf-8 -*- +import time +import random +import requests +import re +import os +import cv2 +from datetime import datetime +from bs4 import BeautifulSoup +import base64 +from fake_useragent import UserAgent # 需要先安装:pip install fake-useragent -###################################################################################################################### -# 获取rtp目录下的文件名,组播IP采集 -files = os.listdir('rtp') -files_name = [] -# 去除后缀名并保存至provinces_isps -for file in files: - name, extension = os.path.splitext(file) - files_name.append(name) -#忽略不符合要求的文件名 -provinces_isps = [name for name in files_name if name.count('_') == 1] -print(f"本次查询:{provinces_isps}的组播节目") -keywords = [] -for province_isp in provinces_isps: - # 读取文件并删除空白行 - try: - with open(f'rtp/{province_isp}.txt', 'r', encoding='utf-8') as file: - lines = file.readlines() - lines = [line.strip() for line in lines if line.strip()] - # 获取第二行中以包含 "rtp://" 的值作为 mcast - if lines: - first_line = lines[1] - if "rtp://" in first_line: - mcast = first_line.split("rtp://")[1].split(" ")[0] - keywords.append(province_isp + "_" + mcast) - except FileNotFoundError: - # 如果文件不存在,则捕获 FileNotFoundError 异常并打印提示信息 - print(f"文件 '{province_isp}.txt' 不存在. 跳过此文件.") -requested_urls = set() # 用于记录已经请求过的地址 -parse_count = {} # 用于记录每个 URL 的解析次数 -###################################################################################################################### -# 获取rtp目录下的文件名,组播IP采集 -files = os.listdir('rtp') -files_name = [] -# 去除后缀名并保存至provinces_isps -for file in files: - name, extension = os.path.splitext(file) - files_name.append(name) -#忽略不符合要求的文件名 -provinces_isps = [name for name in files_name if name.count('_') == 1] -print(f"本次查询:{provinces_isps}的组播节目") -keywords = [] -for province_isp in provinces_isps: - # 读取文件并删除空白行 - try: - with open(f'rtp/{province_isp}.txt', 'r', encoding='utf-8') as file: - lines = file.readlines() - lines = [line.strip() for line in lines if line.strip()] - # 获取第二行中以包含 "rtp://" 的值作为 mcast - if lines: - first_line = lines[1] - if "rtp://" in first_line: - mcast = first_line.split("rtp://")[1].split(" ")[0] - keywords.append(province_isp + "_" + mcast) - except FileNotFoundError: - # 如果文件不存在,则捕获 FileNotFoundError 异常并打印提示信息 - print(f"文件 '{province_isp}.txt' 不存在. 跳过此文件.") -requested_urls = set() # 用于记录已经请求过的地址 -parse_count = {} # 用于记录每个 URL 的解析次数 -for keyword in keywords: - province, isp, mcast = keyword.split("_") - # 将省份转成英文小写 - # 根据不同的isp设置不同的org值 - if province == "北京" and isp == "联通": - isp_en = "cucc" - org = "China Unicom Beijing Province NeTwork" - elif isp == "联通": - isp_en = "cucc" - org = "CHINA UNICOM China169 Backbone" - elif isp == "电信": - org = "Chinanet" - isp_en = "ctcc" - elif isp == "移动": - org = "China Mobile communications corporation" - isp_en = "cmcc" +# 创建输出目录 +os.makedirs('playlist', exist_ok=True) - current_time = datetime.now() - timeout_cnt = 0 - result_urls = set() - should_continue_while = True - while should_continue_while and len(result_urls) == 0 and timeout_cnt <= 2: +# 配置参数 +DELAY_RANGE = (3, 6) # 随机延迟时间范围(秒) +MAX_RETRIES = 3 # 最大重试次数 +REQUEST_TIMEOUT = 10 # 请求超时时间(秒) + +def get_random_header(): + """生成随机请求头""" + return { + 'User-Agent': UserAgent().random, + 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8', + 'Referer': 'https://fofa.info/' + } + +def safe_request(url): + """带重试机制的请求函数""" + for attempt in range(MAX_RETRIES): try: - search_url = 'https://fofa.info/result?qbase64=' - search_txt = f'"udpxy" && country="CN" && region="{province}"' # && org="{org}" - # 将字符串编码为字节流 - bytes_string = search_txt.encode('utf-8') - # 使用base64进行编码 - search_txt = base64.b64encode(bytes_string).decode('utf-8') - search_url += search_txt - if search_url not in requested_urls: # 仅当地址未被请求过时才进行请求 - print(f"{current_time} 查询运营商 : {province}{isp},查询网址 : {search_url}") - response = requests.get(search_url, timeout=5) - # 处理响应 - response.raise_for_status() - # 检查请求是否成功 - html_content = response.text - requested_urls.add(search_url) # 将请求过的地址添加到记录集合中 + # 随机延迟防止被封 + time.sleep(random.uniform(*DELAY_RANGE)) + + response = requests.get( + url, + headers=get_random_header(), + timeout=REQUEST_TIMEOUT + ) + + # 检查HTTP状态码 + if response.status_code == 429: + wait_time = 30 # 遇到反爬等待30秒 + print(f"遇到反爬机制,等待{wait_time}秒后重试") + time.sleep(wait_time) + continue + + response.raise_for_status() + return response.text + + except Exception as e: + print(f"请求失败(第{attempt+1}次重试): {str(e)}") + if attempt == MAX_RETRIES - 1: + raise - if search_url not in parse_count: - parse_count[search_url] = 0 - if parse_count[search_url] < 2: - # 使用BeautifulSoup解析网页内容 - html_soup = BeautifulSoup(html_content, "html.parser") - # print(f"{current_time} html_content:{html_content}") - # 查找所有符合指定格式的网址 - # 设置匹配的格式,如http://8.8.8.8:8888 - pattern = r"http://\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d+" - urls_all = re.findall(pattern, html_content) - # 去重得到唯一的URL列表 - result_urls = set(urls_all) - print(f"{current_time} result_urls:{result_urls}") - parse_count[search_url] += 1 - else: - print(f"{current_time} 已达到对 {search_url} 的最大解析次数(2次)") - should_continue_while = False # 当达到最大解析次数,修改循环条件 - except (requests.Timeout, requests.RequestException) as e: - timeout_cnt += 1 - print(f"{current_time} [{province}]搜索请求发生超时,异常次数:{timeout_cnt}") - if timeout_cnt > 2: - print(f"{current_time} 搜索IPTV频道源[{province}{isp}],超时次数过多:{timeout_cnt} 次,停止处理") - break - - valid_ips = [] - # 遍历所有视频链接 - for url in result_urls: - video_url = url + "/rtp/" + mcast - # 用OpenCV读取视频 - cap = cv2.VideoCapture(video_url) - # 检查视频是否成功打开 - if not cap.isOpened(): - print(f"{current_time} {video_url} 无效") - else: - # 读取视频的宽度和高度 +def validate_video(url, mcast): + """验证视频流有效性""" + video_url = f"{url}/rtp/{mcast}" + print(f"正在验证: {video_url}") + + try: + # 设置超时参数 + cap = cv2.VideoCapture(video_url, cv2.CAP_FFMPEG) + cap.set(cv2.CAP_PROP_TIMEOUT, 5000) # 5秒超时 + + if cap.isOpened(): width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) - print(f"{current_time} {video_url} 的分辨率为 {width}x{height}") - # 检查分辨率是否大于0 - if width > 0 and height > 0: - valid_ips.append(url) - # 关闭视频流 cap.release() + return width > 0 and height > 0 + return False + + except Exception as e: + print(f"视频验证异常: {str(e)}") + return False + +def main(): + # 获取需要处理的文件列表 + files = [f.split('.')[0] for f in os.listdir('rtp') if f.endswith('.txt')] + print(f"待处理频道列表: {files}") + + for filename in files: + province_isp = filename.split('_') + if len(province_isp) != 2: + continue + + province, isp = province_isp + print(f"\n正在处理: {province}{isp}") + + # 读取组播地址 + try: + with open(f'rtp/{filename}.txt', 'r', encoding='utf-8') as f: + mcast = f.readline().split('rtp://')[1].split()[0].strip() + except Exception as e: + print(f"文件读取失败: {str(e)}") + continue + + # 构造搜索请求 + search_txt = f'"udpxy" && country="CN" && region="{province}"' + encoded_query = base64.b64encode(search_txt.encode()).decode() + search_url = f'https://fofa.info/result?qbase64={encoded_query}' + + # 执行搜索 + try: + html = safe_request(search_url) + except Exception as e: + print(f"搜索失败: {str(e)}") + continue + + # 解析搜索结果 + soup = BeautifulSoup(html, 'html.parser') + pattern = re.compile(r"http://\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d+") + found_urls = set(pattern.findall(html)) + print(f"找到{len(found_urls)}个有效地址") + + # 验证地址有效性 + valid_urls = [url for url in found_urls if validate_video(url, mcast)] + print(f"验证通过{len(valid_urls)}个有效地址") + + # 生成播放列表 + if valid_urls: + output_file = f'playlist/{province}{isp}.txt' + with open(f'rtp/{filename}.txt', 'r') as src, open(output_file, 'w') as dst: + original_content = src.read() + for url in valid_urls: + modified = original_content.replace('rtp://', f'{url}/rtp/') + dst.write(modified + '\n') + print(f"已生成播放列表: {output_file}") + +if __name__ == '__main__': + main() - if valid_ips: - # 生成节目列表 省份运营商.txt - rtp_filename = f'rtp/{province}_{isp}.txt' - txt_filename = f'playlist/{province}{isp}.txt' - with open(rtp_filename, 'r', encoding='utf-8') as file: - data = file.read() - with open(txt_filename, 'a') as new_file: # 以追加形式写入 - for url in valid_ips: - new_data = data.replace("rtp://", f"{url}/rtp/") - new_file.write(new_data) - print(f'已生成播放列表,保存至{txt_filename}') print('对playlist文件夹里面的所有txt文件进行去重处理') def remove_duplicates_keep_order(folder_path):