Update GAT.py

2024-10-05 10:20:19 +08:00
parent d711f81928
commit bfc2e63472
1 changed files with 60 additions and 54 deletions
@@ -81,7 +81,7 @@ def merge_txt_files(urls, output_filename='汇总.txt'):
                try:
                    response = requests.get(url)
                    response.raise_for_status()  # 确保请求成功
-                    # 尝试将响应内容解码为UTF-8，如果失败则尝试其他编码
+                    # 尝试将响应内容解码为UTF-8,如果失败则尝试其他编码
                    try:
                        content = response.content.decode('utf-8')
                    except UnicodeDecodeError:
@@ -264,14 +264,14 @@ with open('汇总.txt', 'w', encoding='utf-8') as new_file:
    for line in lines:
        # 去除行尾的换行符
        line = line.rstrip('\n')
-        # 分割行，获取逗号前的字符串
+        # 分割行,获取逗号前的字符串
        parts = line.split(',', 1)
        if len(parts) > 0:
            # 替换逗号前的字符串
            before_comma = parts[0]
            for old, new in replacements.items():
                before_comma = before_comma.replace(old, new)
-            # 将替换后的逗号前部分和逗号后部分重新组合成一行，并写入新文件
+            # 将替换后的逗号前部分和逗号后部分重新组合成一行,并写入新文件
            new_line = f'{before_comma},{parts[1]}\n' if len(parts) > 1 else f'{before_comma}\n'
            new_file.write(new_line)

@@ -323,7 +323,7 @@ def remove_duplicates(input_file, output_file):
            if urls and urls[0] not in seen_urls:
                seen_urls.add(urls[0])
                output_lines.append(line)
-            # 如果找到包含genre的行，无论是否已被记录，都写入新文件
+            # 如果找到包含genre的行,无论是否已被记录,都写入新文件
            if genre_line:
                output_lines.append(line)
    # 将结果写入输出文件
@@ -362,7 +362,7 @@ with open('2.txt', 'r', encoding='utf-8') as file:
        for line in lines:
            # 首先检查行是否包含任何提取关键词
            if any(keyword in line for keyword in extract_keywords):
-                # 如果包含提取关键词，进一步检查行是否不包含任何排除关键词
+                # 如果包含提取关键词,进一步检查行是否不包含任何排除关键词
                if not any(keyword in line for keyword in excluded_keywords):
                    outfile.write(line)  # 写入符合条件的行到文件

@@ -378,7 +378,7 @@ def parse_file(input_file_path, output_file_name):
    with open(input_file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
-            # 如果行是分类标签行，则跳过
+            # 如果行是分类标签行,则跳过
            if ",#genre#" in line:
                continue
            # 检查行是否包含IP或域名
@@ -386,7 +386,7 @@ def parse_file(input_file_path, output_file_name):
            if match:
                # 提取匹配到的IP或域名
                matched_text = match.group(1)
-                # 去除IP或域名后的剩余部分，只保留匹配到的IP或域名
+                # 去除IP或域名后的剩余部分,只保留匹配到的IP或域名
                ip_or_domain = matched_text.split('://')[-1].split('/')[0].split('::')[0]
                # 将行添加到对应的IP或域名列表中
                if ip_or_domain not in ip_or_domain_to_lines:
@@ -395,9 +395,9 @@ def parse_file(input_file_path, output_file_name):
    ############################################################################### Drop IP/domain groups whose lines have a combined length below 300 characters
    filtered_ip_or_domain_to_lines = {ip_or_domain: lines for ip_or_domain, lines in ip_or_domain_to_lines.items()
                                      if sum(len(line) for line in lines) >= 300}
-    # 如果没有满足条件的IP或域名段，则不生成文件
+    # 如果没有满足条件的IP或域名段,则不生成文件
    if not filtered_ip_or_domain_to_lines:
-        print("没有满足条件的IP或域名段，不生成文件。")
+        print("没有满足条件的IP或域名段,不生成文件。")
        return
    # 合并所有满足条件的IP或域名的行到一个文件
    with open(output_file_name, 'w', encoding='utf-8') as output_file:
@@ -426,7 +426,7 @@ with open(output_file, 'w', encoding='utf-8') as f:
            f.write(text_content + '\n')
            print(f"{keyword}获取完成")        
        else:
-            print(f'请求 {url} 失败，状态码：{response.status_code}')
+            print(f'请求 {url} 失败,状态码：{response.status_code}')
        time.sleep(1)  # 添加 1 秒的延迟
 with open('gat.txt', 'r', encoding='utf-8') as infile:
    lines = infile.readlines()
@@ -460,7 +460,7 @@ with open(output_file, 'w', encoding='utf-8') as f:
                except Exception as e:
                    print(f"Error parsing content for keyword {keyword}: {e}")
            else:
-                print(f'请求 {url} 失败，状态码：{response.status_code}')
+                print(f'请求 {url} 失败,状态码：{response.status_code}')
        except Exception as e:
            print(f"Error fetching URL for keyword {keyword}: {e}")
        time.sleep(1)  # 添加 1 秒的延迟
@@ -501,7 +501,7 @@ def remove_duplicates(input_file, output_file):
            if urls and urls[0] not in seen_urls:
                seen_urls.add(urls[0])
                output_lines.append(line)
-            # 如果找到包含genre的行，无论是否已被记录，都写入新文件
+            # 如果找到包含genre的行,无论是否已被记录,都写入新文件
            if genre_line:
                output_lines.append(line)
    # 将结果写入输出文件
@@ -510,12 +510,12 @@ def remove_duplicates(input_file, output_file):
    print("去重后的行数：", len(output_lines))
 # 使用方法
 remove_duplicates('gat.txt', '网络收集.txt')
-print("处理完成，去重完成")
+print("处理完成,去重完成")

 ############################################ 假设要打开的文本文件名为*.txt
 with open('网络收集.txt', 'r', encoding='utf-8') as f:
    content1 = f.read()
-# 查找以'网络收集'命名的文件，可以遍历当前目录进行查找
+# 查找以'网络收集'命名的文件,可以遍历当前目录进行查找
 for filename in os.listdir():
    if '网络收集' in filename:
        with open(filename, 'r', encoding='utf-8') as f:
@@ -539,11 +539,11 @@ unique_lines = []
 seen_lines = set() 
 # 打印去重前的行数
 print(f"去重前的行数: {len(lines)}")
-# 遍历每一行，如果是新的就加入unique_lines 
+# 遍历每一行,如果是新的就加入unique_lines 
 for line in lines:
    line_stripped = line.strip()  # 去除行尾的换行符
    if line_stripped not in seen_lines:
-        unique_lines.append(line)  # 保持原始行的格式，包括换行符
+        unique_lines.append(line)  # 保持原始行的格式,包括换行符
        seen_lines.add(line_stripped)
 # 将唯一的行写入新的文档 
 with open('网络收集.txt', 'w', encoding="utf-8") as file:
@@ -555,7 +555,7 @@ print(f"去重后的行数: {len(unique_lines)}")

 ################ Convert Traditional Chinese to Simplified Chinese
 #################################################################
-# 创建一个OpenCC对象，指定转换的规则为繁体字转简体字
+# 创建一个OpenCC对象,指定转换的规则为繁体字转简体字
 converter = OpenCC('t2s.json')#繁转简
 #converter = OpenCC('s2t.json')#简转繁
 # 打开txt文件
@@ -566,14 +566,14 @@ simplified_text = converter.convert(traditional_text)
 # 将转换后的简体字写入txt文件
 with open('网络收集.txt', 'w', encoding='utf-8') as file:
    file.write(simplified_text)
-print("处理完成，繁体转换完成")
+print("处理完成,繁体转换完成")


 ######################################################################################提取goodiptv
 import re
 import os
 # 定义一个包含所有要排除的关键词的列表
-excluded_keywords = ['zhoujie218', 'service', '112114', 'xfjcHD', '安徽', '文化', '电影台61', 'stream8.jlntv', 'live.cooltv', 'P2P', 'Plus', '38.64.72.148', '新闻综合', 'P3p', 'cookies', '9930/qilu', 'gitcode.net', 'free.xiptv']   #, '', ''
+excluded_keywords = ['zhoujie218', 'service', '112114', 'xfjcHD', '安徽', '七彩', '文化', '电影台61', 'youtube', 'stream8.jlntv', 'live.cooltv', 'P2P', 'Plus', '38.64.72.148', '新闻综合', 'P3p', 'cookies', '9930/qilu', 'gitcode.net', 'free.xiptv']   #, '', ''
 # 定义一个包含所有要提取的关键词的列表
 extract_keywords = [',']
 # 读取文件并处理每一行
@@ -584,7 +584,7 @@ with open('网络收集.txt', 'r', encoding='utf-8') as file:
        for line in lines:
            # 首先检查行是否包含任何提取关键词
            if any(keyword in line for keyword in extract_keywords):
-                # 如果包含提取关键词，进一步检查行是否不包含任何排除关键词
+                # 如果包含提取关键词,进一步检查行是否不包含任何排除关键词
                if not any(keyword in line for keyword in excluded_keywords):
                    outfile.write(line)  # 写入符合条件的行到文件

@@ -621,7 +621,7 @@ print("/" * 80)
 replacements = {
        " ": "",
 }
-# 打开原始文件读取内容，并写入新文件
+# 打开原始文件读取内容,并写入新文件
 with open('网络收集.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()
 # 创建新文件并写入替换后的内容
@@ -642,10 +642,10 @@ from tqdm import tqdm
 def test_connectivity_and_download(url, initial_timeout=1, retry_timeout=1):
    parsed_url = urlparse(url)
    if parsed_url.scheme not in ['http', 'https']:
-        # 非HTTP(s)协议，尝试RTSP检测
+        # 非HTTP(s)协议,尝试RTSP检测
        return test_rtsp_connectivity(url, retry_timeout)
    else:
-        # HTTP(s)协议，使用原始方法
+        # HTTP(s)协议,使用原始方法
        try:
            with requests.get(url, stream=True, timeout=initial_timeout) as response:
                if response.status_code == 200:
@@ -678,7 +678,7 @@ def main(输入, 输出):
    results = []
    for line_number, line in enumerate(tqdm(lines, desc="检测中")):
        parts = line.strip().split(",")
-        if len(parts) == 2 and parts[1]:  # 确保有URL，并且URL不为空
+        if len(parts) == 2 and parts[1]:  # 确保有URL,并且URL不为空
            channel_name, channel_url = parts
            try:
                is_valid = test_connectivity_and_download(channel_url)
@@ -717,11 +717,11 @@ from pypinyin import lazy_pinyin
 with open("网络收集.txt", "r", encoding="utf-8") as file:
    # 读取所有行并存储到列表中
    lines = file.readlines()
-# 定义一个函数，用于提取每行的第一个数字
+# 定义一个函数,用于提取每行的第一个数字
 def extract_first_number(line):
    """Return the first run of decimal digits in *line* as an int.

    When *line* contains no digits, positive infinity is returned instead.
    """
    found = re.search(r'\d+', line)
    if found is None:
        return float('inf')
    return int(found.group())
-# 对列表中的行进行排序，按照第一个数字的大小排列，其余行按中文排序
+# 对列表中的行进行排序,按照第一个数字的大小排列,其余行按中文排序
 sorted_lines = sorted(lines, key=lambda x: (not 'CCTV' in x, extract_first_number(x) if 'CCTV' in x else lazy_pinyin(x.strip())))
 # 将排序后的行写入新的utf-8编码的文本文件
 with open("网络收集.txt", "w", encoding="utf-8") as file:
@@ -753,11 +753,17 @@ replacements = {
    	"」": "",
    	"标清": "",
    	"-": "",
-    	"": "",
-    	"": "",
-    	"": "",
-    	"": "",
-    	"": "",
+    	"NEWS": "新闻",
+    	"台,": ",",
+    	"歌厅": "",
+    	"秀,": ",",
+    	"TVBJade": "TVB",
+    	"CantoneseSubtitles": "",
+    	"+": "",
+    	"财经新闻": "财经",
+    	"番薯,": "番薯音乐,",
+    	"番薯111,": "番薯音乐,",
+    	"凤凰资讯,http://61.221": "中天娱乐,http://61.221",
    	"": "",
    	"": "",
    	"[1080p]": "",
@@ -940,18 +946,18 @@ with open('2.txt', 'w', encoding='utf-8') as new_file:
    for line in lines:
        # 去除行尾的换行符
        line = line.rstrip('\n')
-        # 分割行，获取逗号前的字符串
+        # 分割行,获取逗号前的字符串
        parts = line.split(',', 1)
        if len(parts) > 0:
            # 替换逗号前的字符串
            before_comma = parts[0]
            for old, new in replacements.items():
                before_comma = before_comma.replace(old, new)
-            # 将替换后的逗号前部分和逗号后部分重新组合成一行，并写入新文件
+            # 将替换后的逗号前部分和逗号后部分重新组合成一行,并写入新文件
            new_line = f'{before_comma},{parts[1]}\n' if len(parts) > 1 else f'{before_comma}\n'
            new_file.write(new_line)

-print("替换完成，新文件已保存。")
+print("替换完成,新文件已保存。")



@@ -970,7 +976,7 @@ def check_and_write_file(input_file, output_file, keywords):
            if re.search(pattern, line):
                extracted_lines.append(line)

-    # 如果至少提取到一行，写入头部信息和提取的行到输出文件
+    # 如果至少提取到一行,写入头部信息和提取的行到输出文件
    if extracted_lines:
        with open(output_file, 'w', encoding='utf-8') as out_file:
            out_file.write(f"{keywords_list[0]},#genre#\n")  # 写入头部信息
@@ -982,14 +988,14 @@ def check_and_write_file(input_file, output_file, keywords):
        # 检查文件的总大小
        file_size = os.path.getsize(output_file)
        
-        # 如果文件大小小于30字节（假设的最小文件大小），删除文件
+        # If the file is smaller than 20 bytes (an assumed minimum size), delete it
        if file_size < 20:
            os.remove(output_file)
-            print(f"文件只包含头部信息，{output_file} 已被删除。")
+            print(f"文件只包含头部信息,{output_file} 已被删除。")
        else:
            print(f"文件已提取关键词并保存为: {output_file}")
    else:
-        print(f"未提取到关键词，不创建输出文件 {output_file}。")
+        print(f"未提取到关键词,不创建输出文件 {output_file}。")

 # 按类别提取关键词并写入文件
 #check_and_write_file('2.txt','a0.txt',keywords="央视频道1,CCTV")
@@ -1014,14 +1020,14 @@ for file_path in file_paths:
        with open(file_path, 'r', encoding="utf-8") as file:
            content = file.read()
            file_contents.append(content)
-    else:                # 如果文件不存在，则提示异常并打印提示信息
-        print(f"文件 {file_path} 不存在，跳过")
+    else:                # 如果文件不存在,则提示异常并打印提示信息
+        print(f"文件 {file_path} 不存在,跳过")
 # 写入合并后的文件
 with open("去重.txt", "w", encoding="utf-8") as output:
    output.write('\n'.join(file_contents))

 ###############################################################################################################################################################################################################################
-##############################################################对生成的文件进行网址及文本去重复，避免同一个频道出现在不同的类中
+##############################################################对生成的文件进行网址及文本去重复,避免同一个频道出现在不同的类中

 def remove_duplicates(input_file, output_file):
    # 用于存储已经遇到的URL和包含genre的行
@@ -1042,7 +1048,7 @@ def remove_duplicates(input_file, output_file):
            if urls and urls[0] not in seen_urls:
                seen_urls.add(urls[0])
                output_lines.append(line)
-            # 如果找到包含genre的行，无论是否已被记录，都写入新文件
+            # 如果找到包含genre的行,无论是否已被记录,都写入新文件
            if genre_line:
                output_lines.append(line)
    # 将结果写入输出文件
@@ -1061,7 +1067,7 @@ with open('网络收集.txt', 'r', encoding="utf-8") as file:
 unique_lines = [] 
 seen_lines = set() 

-# 遍历每一行，如果是新的就加入unique_lines 
+# 遍历每一行,如果是新的就加入unique_lines 
 for line in lines:
 if line not in seen_lines:
  unique_lines.append(line)
@@ -1075,7 +1081,7 @@ with open('网络收集.txt', 'w', encoding="utf-8") as file:



-################################################################################################任务结束，删除不必要的过程文件
+################################################################################################任务结束,删除不必要的过程文件
 files_to_remove = ['去重.txt', "2.txt", "a0.txt", "a.txt", "a1.txt", "b0.txt", "b.txt", "c.txt", "c1.txt", "c2.txt", "d.txt", "e.txt", "f0.txt", "f.txt", "f1.txt", "g0.txt", "g.txt", "g1.txt", "h0.txt", "h.txt", "h1.txt", "i.txt", \
              "i1.txt", "j.txt", "j1.txt", "k.txt", "l0.txt", "l.txt", "l1.txt", "m.txt", "m1.txt",  \
              "n0.txt","n.txt","n1.txt", "o1.txt", "o.txt", "p.txt"]
@@ -1083,10 +1089,10 @@ files_to_remove = ['去重.txt', "2.txt", "a0.txt", "a.txt", "a1.txt", "b0.txt",
 for file in files_to_remove:
    if os.path.exists(file):
        os.remove(file)
-    else:              # 如果文件不存在，则提示异常并打印提示信息
-        print(f"文件 {file} 不存在，跳过删除。")
+    else:              # 如果文件不存在,则提示异常并打印提示信息
+        print(f"文件 {file} 不存在,跳过删除。")

-print("任务运行完毕，gat频道列表可查看文件夹内综合源.txt文件！")
+print("任务运行完毕,gat频道列表可查看文件夹内综合源.txt文件！")


 def append_text_between_files(file1_path, file2_path):
@@ -1115,7 +1121,7 @@ file_path1 = '网络收集.txt'
 file_path2 = '综合源.txt'
 append_text_between_files(file_path1, file_path2)

-##############################################################对生成的文件进行网址及文本去重复，避免同一个频道出现在不同的类中
+##############################################################对生成的文件进行网址及文本去重复,避免同一个频道出现在不同的类中

 def remove_duplicates(input_file, output_file):
    # 用于存储已经遇到的URL和包含genre的行
@@ -1136,7 +1142,7 @@ def remove_duplicates(input_file, output_file):
            if urls and urls[0] not in seen_urls:
                seen_urls.add(urls[0])
                output_lines.append(line)
-            # 如果找到包含genre的行，无论是否已被记录，都写入新文件
+            # 如果找到包含genre的行,无论是否已被记录,都写入新文件
            if genre_line:
                output_lines.append(line)
    # 将结果写入输出文件
@@ -1216,14 +1222,14 @@ with open(file_path, 'r+', encoding='utf-8') as f:

     

-################################################################################################任务结束，删除不必要的过程文件
+################################################################################################任务结束,删除不必要的过程文件
 files_to_remove = ["gat.txt", "汇总.txt"]
 for file in files_to_remove:
    if os.path.exists(file):
        os.remove(file)
-    else:              # 如果文件不存在，则提示异常并打印提示信息
-        print(f"文件 {file} 不存在，跳过删除。")
-print("任务运行完毕，频道列表可查看文件夹内源.txt文件！")
+    else:              # 如果文件不存在,则提示异常并打印提示信息
+        print(f"文件 {file} 不存在,跳过删除。")
+print("任务运行完毕,频道列表可查看文件夹内源.txt文件！")

 import os
 import datetime
@@ -1270,4 +1276,4 @@ delete_nonstandard_files()
 rename_standard_files()


-print("任务运行完毕，gat频道列表可查看文件夹内综合源.txt文件！")
+print("任务运行完毕,gat频道列表可查看文件夹内综合源.txt文件！")