Add files via upload

2024-08-12 09:52:53 +08:00
parent 10da4dbdf7
commit da6cc4c035
1 changed files with 255 additions and 0 deletions
@@ -0,0 +1,255 @@
+#本程序只对酒店源进行了720p以上分辨率过滤，IP段去重。组播和自定义源请自行从源文件过滤
+import time
+import concurrent.futures
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+import requests
+import re
+import os
+import threading
+from queue import Queue
+import queue
+from datetime import datetime
+import replace
+import fileinput
+from concurrent.futures import ThreadPoolExecutor
+from tqdm import tqdm
+from pypinyin import lazy_pinyin
+from opencc import OpenCC
+import base64
+import cv2
+from bs4 import BeautifulSoup
+from urllib.parse import urlparse
+from translate import Translator  # 导入Translator类，用于文本翻译
+
+
+
+
+
+
+##########################################################IP段去重,保留最后一个IP段，防止高峰拥堵，也减少不必要的检测行
+
+def deduplicate_lines(input_file_path, output_file_path):
+    seen_combinations = {}
+    unique_lines = []
+    with open(input_file_path, 'r', encoding='utf-8') as file:
+        for line in file:
+            # 使用正则表达式查找行中的所有URL，并捕获IP地址、端口号和端口号之后的部分
+            urls = re.findall(r'http://([\d.]+):(\d+)(/.*)?', line)
+            # 为每个URL生成一个去重键
+            for full_url in urls:
+                ip, port, path = full_url
+                ip_parts = ip.split('.')
+                if len(ip_parts) < 3:
+                    continue
+                # 使用IP的前三个字段和端口号之后的部分生成去重键
+                combination_key = f"{ip_parts[0]}.{ip_parts[1]}.{ip_parts[2]}-{port}-{path or ''}"
+                # 检查这个组合是否已经出现过
+                if combination_key not in seen_combinations:
+                    # 如果没有出现过，记录当前行和去重键
+                    seen_combinations[combination_key] = line.strip()
+                else:
+                    # 如果已经出现过，更新为最后出现的行
+                    seen_combinations[combination_key] = line.strip()
+    # 将去重后的所有唯一行写入新文件
+    with open(output_file_path, 'w', encoding='utf-8') as file:
+        for line in seen_combinations.values():
+            file.write(line + '\n')
+print(f"IP段去重完成")            
+# 调用函数
+input_file_path = '酒店源.txt'
+output_file_path = '酒店源.txt'
+deduplicate_lines(input_file_path, output_file_path)
+################################################################################
+
+
+#################################################### 对整理好的频道列表测试HTTP连接
+# 函数：获取视频分辨率
+def get_video_resolution(video_path, timeout=1):
+    # 使用OpenCV创建视频捕获对象
+    cap = cv2.VideoCapture(video_path)
+    # 检查视频是否成功打开
+    if not cap.isOpened():
+        return None
+    # 获取视频的宽度和高度
+    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+    # 释放视频捕获对象
+    cap.release()
+    # 返回视频的分辨率
+    return (width, height)
+
+# 函数：处理每一行
+def process_line(line, output_file, order_list, valid_count, invalid_count, total_lines):
+    # 去除行尾的空白字符并按逗号分割行
+    parts = line.strip().split(',')
+    # 如果行包含特定的标签'#genre#'，则直接写入新文件
+    if '#genre#' in line:
+        with threading.Lock():  # 使用线程锁保证写入操作的原子性
+            output_file.write(line)
+            print(f"已写入genre行：{line.strip()}")
+    # 如果分割后的部分数量为2，则继续处理
+    elif len(parts) == 2:
+        channel_name, channel_url = parts
+        # 获取视频的分辨率
+        resolution = get_video_resolution(channel_url, timeout=2)
+        # 如果分辨率有效且高度大于等于720p
+        if resolution and resolution[1] >= 720:
+            with threading.Lock():  # 使用线程锁
+                output_file.write(f"{channel_name}[{resolution[1]}p],{channel_url}\n")
+                # 将频道名、分辨率和URL添加到列表中
+                order_list.append((channel_name, resolution[1], channel_url))
+                # 有效计数增加
+                valid_count[0] += 1
+                print(f"Channel '{channel_name}' accepted with resolution {resolution[1]}p at URL {channel_url}.")
+        else:
+            # 如果分辨率不满足条件，无效计数增加
+            invalid_count[0] += 1
+    # 打印当前处理进度
+    with threading.Lock():
+        print(f"有效: {valid_count[0]}, 无效: {invalid_count[0]}, 总数: {total_lines}, 进度: {(valid_count[0] + invalid_count[0]) / total_lines * 100:.2f}%")
+
+# 函数：多线程工作
+def worker(task_queue, output_file, order_list, valid_count, invalid_count, total_lines):
+    # 循环直到队列为空
+    while True:
+        try:
+            # 从队列中获取任务，超时时间为1秒
+            line = task_queue.get(timeout=0.5)
+            # 处理获取的任务
+            process_line(line, output_file, order_list, valid_count, invalid_count, total_lines)
+        except Queue.Empty:  # 如果队列为空，捕获异常
+            break
+        finally:
+            # 标记任务已完成
+            task_queue.task_done()
+
+# 主函数
+def main(source_file_path, output_file_path):
+    # 初始化列表和计数器
+    order_list = []
+    valid_count = [0]
+    invalid_count = [0]
+    task_queue = Queue()
+    # 使用with语句打开源文件并读取所有行
+    with open(source_file_path, 'r', encoding='utf-8') as source_file:
+        lines = source_file.readlines()
+    # 使用with语句打开输出文件准备写入
+    with open(output_file_path, 'w', encoding='utf-8') as output_file:
+        # 创建线程池，最大工作线程数为64
+        with ThreadPoolExecutor(max_workers=64) as executor:
+            # 为线程池中的每个线程提交worker函数
+            for _ in range(64):
+                executor.submit(worker, task_queue, output_file, order_list, valid_count, invalid_count, len(lines))
+            # 将所有行放入任务队列
+            for line in lines:
+                task_queue.put(line)
+            # 等待队列中的所有任务完成
+            task_queue.join()
+    # 打印任务完成的统计信息
+    print(f"任务完成，有效频道数：{valid_count[0]}, 无效频道数：{invalid_count[0]}, 总频道数：{len(lines)}")
+
+# 程序入口点
+if __name__ == "__main__":
+    # 定义源文件和输出文件的路径
+    source_file_path = '酒店源.txt'  # 替换为你的源文件路径
+    output_file_path = '检测结果.txt'  # 替换为你的输出文件路径
+    # 调用主函数
+    main(source_file_path, output_file_path)
+
+
+
+
+###############################################################################文本排序
+# 打开原始文件读取内容，并写入新文件
+with open('检测结果.txt', 'r', encoding='utf-8') as file:
+    lines = file.readlines()
+# 定义一个函数，用于提取每行的第一个数字
+def extract_first_number(line):
+    match = re.search(r'\d+', line)
+    return int(match.group()) if match else float('inf')
+# 对列表中的行进行排序
+# 按照第一个数字的大小排列，如果不存在数字则按中文拼音排序
+sorted_lines = sorted(lines, key=lambda x: (not 'CCTV' in x, extract_first_number(x) if 'CCTV' in x else lazy_pinyin(x.strip())))
+# 将排序后的行写入新的utf-8编码的文本文件，文件名基于原文件名
+output_file_path = "sorted_" + os.path.basename(file_path)
+# 写入新文件
+with open('酒店源.txt', "w", encoding="utf-8") as file:
+    for line in sorted_lines:
+        file.write(line)
+print(f"文件已排序并保存为新文件")
+
+
+########################################################################定义关键词分割规则,分类提取
+def check_and_write_file(input_file, output_file, keywords):
+    # 使用 split(', ') 而不是 split(',') 来分割关键词
+    keywords_list = keywords.split(', ')
+    first_keyword = keywords_list[0]  # 获取第一个关键词作为头部信息
+    pattern = '|'.join(re.escape(keyword) for keyword in keywords_list)
+    extracted_lines = False
+    with open(input_file, 'r', encoding='utf-8') as file:
+        lines = file.readlines()
+    with open(output_file, 'w', encoding='utf-8') as out_file:
+        out_file.write(f'{first_keyword},#genre#\n')  # 使用第一个关键词作为头部信息
+        for line in lines:
+            if 'genre' not in line and 'epg' not in line:
+                if re.search(pattern, line):
+                    out_file.write(line)
+                    extracted_lines = True
+    # 如果没有提取到任何关键词，则不保留输出文件
+    if not extracted_lines:
+        os.remove(output_file)  # 删除空的输出文件        
+        print(f"未提取到关键词，{output_file} 已被删除。")
+    else:
+        print(f"文件已提取关键词并保存为: {output_file}")
+
+
+# 按类别提取关键词并写入文件
+check_and_write_file('酒店源.txt',  'a.txt',  keywords="央视频道, CCTV")
+check_and_write_file('酒店源.txt',  'b.txt',  keywords="卫视频道, 卫视")
+check_and_write_file('酒店源.txt',  'c.txt',  keywords="影视频道, 影, 剧, 大片")
+check_and_write_file('酒店源.txt',  'e.txt',  keywords="港澳频道, TVB, 珠江台, 澳门, 龙华, 广场舞, 动物杂技, 民视, 中视, 华视, AXN, MOMO, 采昌, 耀才, 靖天, 镜新闻, 靖洋, 莲花, 年代, 爱尔达, 好莱坞, 华丽, 非凡, 公视, \
+寰宇, 无线, EVEN, MoMo, 爆谷, 面包, momo, 唐人, 中华小, 三立, CNA, FOX, RTHK, Movie, 八大, 中天, 中视, 东森, 凤凰, 天映, 美亚, 环球, 翡翠, 亚洲, 大爱, 大愛, 明珠, 半岛, AMC, 龙祥, 台视, 1905, 纬来, 神话, 经典都市, 视界, \
+番薯, 私人, 酒店, TVB, 凤凰, 半岛, 星光视界, 大愛, 新加坡, 星河, 明珠, 环球, 翡翠台")
+
+
+#对生成的文件进行合并
+file_contents = []
+file_paths = ["e.txt", "a.txt", "b.txt", "c.txt"]  # 替换为实际的文件路径列表
+for file_path in file_paths:
+    if os.path.exists(file_path):
+        with open(file_path, 'r', encoding="utf-8") as file:
+            content = file.read()
+            file_contents.append(content)
+    else:                # 如果文件不存在，则提示异常并打印提示信息
+        print(f"文件 {file_path} 不存在，跳过")
+# 写入合并后的文件
+with open("去重.txt", "w", encoding="utf-8") as output:
+    output.write('\n'.join(file_contents))
+
+
+##################################################################### 打开文档并读取所有行 ，对提取后重复的频道去重
+with open('去重.txt', 'r', encoding="utf-8") as file:
+ lines = file.readlines()
+# 使用列表来存储唯一的行的顺序 
+ unique_lines = [] 
+ seen_lines = set() 
+# 遍历每一行，如果是新的就加入unique_lines 
+for line in lines:
+ if line not in seen_lines:
+  unique_lines.append(line)
+  seen_lines.add(line)
+# 将唯一的行写入新的文档 
+with open('酒店源.txt', 'w', encoding="utf-8") as file:
+ file.writelines(unique_lines)
+
+
+#任务结束，删除不必要的过程文件
+files_to_remove = ['去重.txt', "2.txt", "iptv.txt", "iptv1.txt", "e.txt", "a.txt", "b.txt", "c.txt", "检测结果.txt"]
+for file in files_to_remove:
+    if os.path.exists(file):
+        os.remove(file)
+    else:              # 如果文件不存在，则提示异常并打印提示信息
+        print(f"文件 {file} 不存在，跳过删除。")
+print("任务运行完毕，酒店源频道列表可查看文件夹内txt文件！")
+