Delete py/GAT.py

2024-09-30 03:12:19 +08:00
parent 94ade98939
commit 1e78a1a869
1 changed files with 0 additions and 273 deletions
@@ -1,273 +0,0 @@
 import requests
 from bs4 import BeautifulSoup
 import re
 import os
 from opencc import OpenCC
 from tqdm import tqdm
 import cv2
 import threading
 from queue import Queue
 from concurrent.futures import ThreadPoolExecutor
 import time
 # Full catalogue of channel-name keywords known to this script. It is kept for
 # reference only: the script used to assign this list to `keywords` and then
 # immediately overwrite it with the short list below, so it was never searched.
 all_keywords = ['凤凰卫视', '人间卫视', '香港卫视', '翡翠', '凤凰香港', '凤凰中文', '凤凰资讯', 'AXNHD', 'AMC电影', '电影台', '大爱', '东森', 'MTV', '好莱坞', '纬来', '天映', '八大',
                 '华视', '中天', '天良', '美亚', '无线', '影剧', '戏剧台', '靖天', '民视', '三立', '影视2', '综艺', '影迷', '台视', '华视',
                 '影视3', '中视', '豬哥亮', 'TVB', '公视', '寰宇', '靖天', '靖洋', '龙华', '龙祥', '猪哥亮',
                 '中华小当家', '中天娱乐', '动漫', '动物星球', '动画台', '壹新闻', '大立电视', '天良', '探案', '超人', '番薯']
 # Keywords that are actually searched on each run.
 keywords = ['凤凰卫视', '人间卫视', '香港卫视']
 # Text file that the scrape stage below writes its results to.
 output_file = 'gat.txt'
 # Query the search page once per keyword and append the page's visible text to
 # the output file.
 with open(output_file, 'w', encoding='utf-8') as f:
     for keyword in keywords:
         url = f'http://tonkiang.us/?&iqtv={keyword}'
         try:
             # A timeout keeps one stalled connection from hanging the whole run.
             response = requests.get(url, timeout=10)
         except requests.RequestException as exc:
             # Best effort: report the failure and carry on with the next keyword
             # instead of aborting with a half-written output file.
             print(f'请求 {url} 异常：{exc}')
         else:
             if response.status_code == 200:
                 soup = BeautifulSoup(response.text, 'html.parser')
                 text_content = soup.get_text()
                 f.write(text_content + '\n')
             else:
                 print(f'请求 {url} 失败，状态码：{response.status_code}')
         time.sleep(2)  # wait 2 seconds between consecutive requests
 # Turn the raw page text into "name,url" rows: every line containing 'http'
 # is paired with the closest non-blank line above it.
 with open('gat.txt', 'r', encoding='utf-8') as infile:
     lines = infile.readlines()
 new_lines = []
 previous_text = None  # most recent non-blank line seen so far (stripped)
 for line in lines:
     stripped = line.strip()
     # A URL line with no non-blank line above it has no channel name; skip it
     # rather than fail on an unbound name.
     if 'http' in line and previous_text is not None:
         new_lines.append(f'{previous_text},{stripped}\n')
     if stripped:
         previous_text = stripped
 with open('gat.txt', 'w', encoding='utf-8') as outfile:
     outfile.writelines(new_lines)
 def remove_duplicates(input_file, output_file):
     """Copy *input_file* to *output_file*, dropping rows whose URL was already seen.

     A row is kept when it carries a URL that has not appeared before, or when
     it contains the word "genre" (case-insensitive); genre rows are kept every
     time they occur. Each input row is written at most once. URLs are compared
     from "://" onwards, so the scheme is not part of the comparison. Rows with
     neither a URL nor "genre" are dropped.

     The input is read completely before the output is opened, so both
     arguments may name the same file. The row counts before and after are
     printed.
     """
     url_pattern = re.compile(
         r'://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
     )
     genre_pattern = re.compile(r'\bgenre\b', re.IGNORECASE)
     seen_urls = set()
     output_lines = []
     with open(input_file, 'r', encoding='utf-8') as f:
         lines = f.readlines()
     print("去重前的行数：", len(lines))
     for line in lines:
         match = url_pattern.search(line)
         is_new_url = match is not None and match.group() not in seen_urls
         if is_new_url:
             seen_urls.add(match.group())
         # One append per row: a genre row that also carries a first-seen URL
         # must not be written twice.
         if is_new_url or genre_pattern.search(line):
             output_lines.append(line)
     with open(output_file, 'w', encoding='utf-8') as f:
         f.writelines(output_lines)
     print("去重后的行数：", len(output_lines))
 # Deduplicate the channel list in place.
 remove_duplicates('gat.txt', 'gat.txt')
 print("处理完成，去重完成")
 # ---- Traditional -> Simplified Chinese conversion ----
 # The 't2s.json' profile converts Traditional to Simplified; swap in
 # 's2t.json' for the opposite direction.
 converter = OpenCC('t2s.json')
 # Read the whole file as one string.
 with open('gat.txt', mode='r', encoding='utf-8') as src:
     traditional_text = src.read()
 simplified_text = converter.convert(traditional_text)
 # Overwrite the file with the converted text.
 with open('gat.txt', mode='w', encoding='utf-8') as dst:
     dst.write(simplified_text)
 print("处理完成，繁体转换完成")
 ######################################################################################提取goodiptv
 import re
 import os
 # Substrings that disqualify a row.
 excluded_keywords = ['zhoujie218', 'service', '112114', 'xfjcHD', 'stream8.jlntv', 'live.cooltv', 'P2P', 'tsfile', 'P3p', 'cookies', '9930/qilu', 'gitcode.net', 'Classic天']
 # Substrings of which a row must contain at least one to be kept.
 extract_keywords = [',']
 # Load all rows first, then rewrite the same file with only the survivors.
 with open('gat.txt', 'r', encoding='utf-8') as file:
     lines = file.readlines()
 kept_rows = [
     row for row in lines
     if any(wanted in row for wanted in extract_keywords)
     and not any(banned in row for banned in excluded_keywords)
 ]
 with open('gat.txt', 'w', encoding='utf-8') as outfile:
     outfile.writelines(kept_rows)
 def filter_lines(file_path):
     """Return the rows of *file_path* that contain a comma and no blocked token.

     The file is read as UTF-8. A row is dropped when it has no ',' or when it
     contains any of the substrings in ``blocked`` below (matching is
     case-sensitive, which is why several spellings of p2p/p3p are listed).
     Rows are returned unchanged, including their line endings.
     """
     blocked = ('epg', 'mitv', 'udp', 'rtp', 'Classic天',
                'P2p', 'p2p', 'p3p', 'P2P', 'P3p', 'P3P')
     # Read the file the caller asked for; the path used to be hard-coded to
     # 'gat.txt', which silently ignored the argument.
     with open(file_path, 'r', encoding='utf-8') as file:
         return [
             line for line in file
             if ',' in line and not any(token in line for token in blocked)
         ]
 def write_filtered_lines(output_file_path, filtered_lines):
     """Write each string in *filtered_lines* to *output_file_path* (UTF-8), replacing its content."""
     with open(output_file_path, mode='w', encoding='utf-8') as sink:
         for row in filtered_lines:
             sink.write(row)
 # When run as a script, filter gat.txt in place with the helpers above.
 if __name__ == "__main__":
     input_file_path = 'gat.txt'
     output_file_path = "gat.txt"
     filtered_lines = filter_lines(input_file_path)
     write_filtered_lines(output_file_path, filtered_lines)
 print("/" * 80)
 # ---- Whole-line text substitutions ----
 # Maps each substring to its replacement; currently this only removes spaces.
 replacements = {" ": ""}
 with open('gat.txt', 'r', encoding='utf-8') as file:
     lines = file.readlines()
 # Apply every substitution to every row, then rewrite the file in one go.
 cleaned_rows = []
 for row in lines:
     for old, new in replacements.items():
         row = row.replace(old, new)
     cleaned_rows.append(row)
 with open('gat.txt', 'w', encoding='utf-8') as new_file:
     new_file.writelines(cleaned_rows)
 import requests
 import time
 import cv2
 from urllib.parse import urlparse
 from tqdm import tqdm
 def test_connectivity_and_download(url, initial_timeout=0.5, retry_timeout=1):
     """Return True when *url* answers and yields at least some stream data.

     URLs whose scheme is not http/https are handed to
     ``test_rtsp_connectivity`` with *retry_timeout* as its timeout. For
     http/https URLs the response is streamed with *initial_timeout* as the
     ``requests`` timeout, and the check succeeds if the status is 200 and the
     first read of up to 4096 bytes returns data.

     ``requests.RequestException`` is reported and treated as failure; other
     exceptions propagate to the caller.
     """
     if urlparse(url).scheme not in ('http', 'https'):
         # Not HTTP(S): fall back to the OpenCV-based stream probe.
         return test_rtsp_connectivity(url, retry_timeout)
     try:
         with requests.get(url, stream=True, timeout=initial_timeout) as response:
             if response.status_code != 200:
                 return False
             # A single read decides the outcome. The previous time-bounded
             # loop kept re-reading after an empty result, which only spun the
             # CPU until the timeout.
             # NOTE(review): assumes an empty read means end of stream; confirm
             # against the urllib3 version in use.
             if response.raw.read(4096):
                 return True
     except requests.RequestException as e:
         print(f"请求异常: {e}")
     return False
 print("/" * 80)
 def test_rtsp_connectivity(url, timeout=3):
     """Return True if OpenCV can open *url* and read one frame within *timeout* seconds.

     Returns False when the capture cannot be opened or no frame is read
     before the deadline. The capture is released on every exit path.
     """
     cap = cv2.VideoCapture(url)
     try:
         if not cap.isOpened():
             return False
         start_time = time.time()
         while time.time() - start_time < timeout:
             ret, _ = cap.read()
             if ret:
                 return True  # a frame was read successfully
         return False
     finally:
         # Previously the capture was only released when the loop timed out.
         cap.release()
 def main(输入, 输出):
     """Probe every "name,url" row of the input file and write the survivors to the output file.

     Rows that do not split on ',' into exactly two parts, or whose second
     part is empty, are skipped. A row is kept when its URL probes as valid or
     when the row contains "genre" (case-insensitive). Any exception raised by
     the probe is printed and counted as invalid. A summary of kept valid and
     invalid rows is printed at the end.
     """
     with open(输入, "r", encoding="utf-8") as source_file:
         lines = source_file.readlines()
     results = []
     for line in tqdm(lines, desc="检测中"):
         parts = line.strip().split(",")
         if len(parts) != 2 or not parts[1]:
             continue  # not a usable "name,url" row
         channel_name, channel_url = parts
         try:
             is_valid = test_connectivity_and_download(channel_url)
         except Exception as e:
             print(f"检测URL {channel_url} 时发生错误: {e}")
             is_valid = False  # a URL that raises is treated as invalid
         if is_valid:
             status = "有效"
         else:
             status = "无效"
         if status == "有效" or "genre" in line.lower():
             results.append((channel_name.strip(), channel_url.strip(), status))
     # Write the kept rows back out as "name,url".
     with open(输出, "w", encoding="utf-8") as output_file:
         output_file.writelines(f"{name},{link}\n" for name, link, _ in results)
     valid_count = sum(1 for entry in results if entry[2] == '有效')
     invalid_count = sum(1 for entry in results if entry[2] == '无效')
     print(f"任务完成, 有效源数量: {valid_count}, 无效源数量: {invalid_count}")
 # When run as a script, probe the channel list and rewrite it in place.
 if __name__ == "__main__":
     输入 = "gat.txt"  # path of the UTF-8 channel list to check
     输出 = "gat.txt"
     main(输入, 输出)
 # ---- Plain lexicographic sort ----
 with open('gat.txt', 'r', encoding='UTF-8') as f:
     lines = sorted(f.readlines())
 with open('gat.txt', 'w', encoding='UTF-8') as f:
     f.writelines(lines)
 #######################拼音排序
 import re
 from pypinyin import lazy_pinyin
 # Load every row of the UTF-8 channel list into a list for the pinyin sort.
 with open("gat.txt", mode="r", encoding="utf-8") as file:
     lines = list(file)
 def extract_first_number(line):
     """Return the first run of digits in *line* as an int, or ``float('inf')`` if there is none."""
     found = re.search(r'\d+', line)
     if found is None:
         return float('inf')
     return int(found.group())
 # Sort key: rows containing 'CCTV' come first, ordered by the first number in
 # the row; all other rows follow, ordered by the pinyin of the stripped row.
 def _pinyin_sort_key(row):
     if 'CCTV' in row:
         return (False, extract_first_number(row))
     return (True, lazy_pinyin(row.strip()))
 sorted_lines = sorted(lines, key=_pinyin_sort_key)
 # Write the sorted rows back to the same UTF-8 file.
 with open("gat.txt", "w", encoding="utf-8") as file:
     file.writelines(sorted_lines)
 # NOTE(review): the message below names 综合源.txt, but this section writes
 # gat.txt; confirm which file users are meant to open.
 print("任务运行完毕，分类频道列表可查看文件夹内综合源.txt文件！")