From 838c190fac0253e42fdab0fa03b559f9b9bbf96e Mon Sep 17 00:00:00 2001
From: frxz751113 <156018267+frxz751113@users.noreply.github.com>
Date: Wed, 28 Aug 2024 23:05:48 +0800
Subject: [PATCH] =?UTF-8?q?Update=20=E7=BD=91=E7=BB=9C=E6=94=B6=E9=9B=86.p?=
 =?UTF-8?q?y?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 py/网络收集.py | 170 ++++++++++++++++++++++---------------------------
 1 file changed, 77 insertions(+), 93 deletions(-)

diff --git a/py/网络收集.py b/py/网络收集.py
index 43a6ae1..56ccce7 100644
--- a/py/网络收集.py
+++ b/py/网络收集.py
@@ -86,30 +86,6 @@ merge_txt_files(urls)
 
 
 
-with open('汇总.txt', 'r', encoding="utf-8") as file:
-    # 读取所有行并存储到列表中
-    lines = file.readlines()
-#定义替换规则的字典对频道名替换
-replacements = {
-    	" ": ""
-}
-with open('汇总.txt', 'w', encoding='utf-8') as new_file:
-    for line in lines:
-        # 去除行尾的换行符
-        line = line.rstrip('\n')
-        # 分割行，获取逗号前的字符串
-        parts = line.split(',', 1)
-        if len(parts) > 0:
-            # 替换逗号前的字符串
-            before_comma = parts[0]
-            for old, new in replacements.items():
-                before_comma = before_comma.replace(old, new)
-            # 将替换后的逗号前部分和逗号后部分重新组合成一行，并写入新文件
-            new_line = f'{before_comma},{parts[1]}\n' if len(parts) > 1 else f'{before_comma}\n'
-            new_file.write(new_line)
-
-
-
 
 
 
@@ -129,66 +105,10 @@ with open('汇总.txt', 'w', encoding='utf-8') as file:
 
 
 
-
-
-
-# 打开文本文件进行读取
-def read_and_process_file(input_filename, output_filename, encodings=['utf-8', 'gbk']):
-    for encoding in encodings:
-        try:
-            with open(input_filename, 'r', encoding=encoding) as file:
-                lines = file.readlines()
-                break
-        except UnicodeDecodeError:
-            continue
-    else:
-        raise ValueError(f"Cannot decode file '{input_filename}' with any of the provided encodings")
-
-    with open(output_filename, 'w', encoding='utf-8') as outfile:
-        for line in lines:
-            if '$' in line:
-                processed_line = line.split('$')[0].rstrip('\n')
-                outfile.write(processed_line + '\n')
-            else:
-                outfile.write(line)
-
-# 调用函数
-read_and_process_file('汇总.txt', '汇总.txt')  # 修改输出文件名以避免覆盖原始文件
-
-###################################################################去重#####################################
-def remove_duplicates(input_file, output_file):
-    # 用于存储已经遇到的URL和包含genre的行
-    seen_urls = set()
-    seen_lines_with_genre = set()
-    # 用于存储最终输出的行
-    output_lines = []
-    # 打开输入文件并读取所有行
-    with open(input_file, 'r', encoding='utf-8') as f:
-        lines = f.readlines()
-        print("去重前的行数：", len(lines))
-        # 遍历每一行
-        for line in lines:
-            # 使用正则表达式查找URL和包含genre的行,默认最后一行
-            urls = re.findall(r'[https]?[http]?[rtsp]?[rtmp]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', line)
-            genre_line = re.search(r'\bgenre\b', line, re.IGNORECASE) is not None
-            # 如果找到URL并且该URL尚未被记录
-            if urls and urls[0] not in seen_urls:
-                seen_urls.add(urls[0])
-                output_lines.append(line)
-            # 如果找到包含genre的行，无论是否已被记录，都写入新文件
-            if genre_line:
-                output_lines.append(line)
-    # 将结果写入输出文件
-    with open(output_file, 'w', encoding='utf-8') as f:
-        f.writelines(output_lines)
-    print("去重后的行数：", len(output_lines))
-# 使用方法
-remove_duplicates('汇总.txt', '汇总.txt')   
-
-###############################################################################替换#########################
-# 导入fileinput模块
-import fileinput
-# 定义替换规则的字典
+with open('汇总.txt', 'r', encoding="utf-8") as file:
+    # Read every line of the file into a list
+    lines = file.readlines()
+# Dictionary of replacement rules applied to channel names
 replacements = {
     	"CCTV-1高清测试": "",
     	"CCTV-2高清测试": "",
@@ -242,6 +162,7 @@ replacements = {
     	"CCTW": "CCTV",
     	"试看": "",
     	"测试": "",
+    	" ": "",
     	"测试cctv": "CCTV",
     	"CCTV1综合": "CCTV1",
     	"CCTV2财经": "CCTV2",
@@ -312,16 +233,79 @@ replacements = {
     	"CCTV7CCTV7": "CCTV7",
     	"CCTV10CCTV10": "CCTV10"
 }
-# 打开原始文件读取内容，并写入新文件
-with open('汇总.txt', 'r', encoding='utf-8') as file:
-    lines = file.readlines()
-# 创建新文件并写入替换后的内容
-with open('2.txt', 'w', encoding='utf-8') as new_file:
+with open('汇总.txt', 'w', encoding='utf-8') as new_file:
     for line in lines:
-        for old, new in replacements.items():
-            line = line.replace(old, new)
-        new_file.write(line)
-print("替换完成，新文件已保存。")
+        # Drop the trailing newline; it is re-added uniformly when the line is written
+        line = line.rstrip('\n')
+        # Split once at the first comma: parts[0] is the text before it
+        parts = line.split(',', 1)
+        if len(parts) > 0:  # always true: str.split returns at least one element
+            # Apply the replacement rules only to the text before the comma
+            before_comma = parts[0]
+            for old, new in replacements.items():
+                before_comma = before_comma.replace(old, new)
+            # Re-join the rewritten prefix with the untouched remainder (if any) and write the line out
+            new_line = f'{before_comma},{parts[1]}\n' if len(parts) > 1 else f'{before_comma}\n'
+            new_file.write(new_line)
+
+
+
+
+
+
+def read_and_process_file(input_filename, output_filename, encodings=('utf-8', 'gbk')):
+    """Copy input to output, cutting every line at its first '$'.
+
+    Each codec in `encodings` is tried in order; ValueError is raised when
+    none decodes the input. Output is UTF-8. Both paths may be the same file.
+    """
+    for encoding in encodings:
+        try:
+            with open(input_filename, 'r', encoding=encoding) as file:
+                lines = file.readlines()
+                break
+        except UnicodeDecodeError:
+            continue
+    else:
+        raise ValueError(f"Cannot decode file '{input_filename}' with any of the provided encodings")
+    with open(output_filename, 'w', encoding='utf-8') as outfile:
+        for line in lines:
+            # Lines without '$' keep their own ending; cut lines get a fresh '\n'.
+            outfile.write(line.split('$')[0].rstrip('\n') + '\n' if '$' in line else line)
+
+# Call the function on the merged list
+read_and_process_file('汇总.txt', '汇总.txt')  # NOTE: input and output are the same path, so the file is rewritten in place
+
+###################################################################去重#####################################
+def remove_duplicates(input_file, output_file):
+    """Write input_file to output_file, keeping one line per distinct URL.
+
+    The first line carrying each URL wins; lines containing the word "genre"
+    (any case) are always kept, exactly once; all other lines are dropped.
+    Both files are UTF-8, and the line counts before and after are printed.
+    """
+    # The scheme is matched as a whole token so the key holds the full URL;
+    # it stays optional so every line accepted before is still accepted.
+    url_pattern = re.compile(r'(?:[a-zA-Z][a-zA-Z0-9+.\-]*)?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
+    genre_pattern = re.compile(r'\bgenre\b', re.IGNORECASE)
+    seen_urls = set()
+    output_lines = []
+    with open(input_file, 'r', encoding='utf-8') as f:
+        lines = f.readlines()
+    print("去重前的行数：", len(lines))
+    for line in lines:
+        match = url_pattern.search(line)
+        is_new_url = match is not None and match.group(0) not in seen_urls
+        if is_new_url:
+            seen_urls.add(match.group(0))
+        if is_new_url or genre_pattern.search(line):
+            output_lines.append(line)
+    with open(output_file, 'w', encoding='utf-8') as f:
+        f.writelines(output_lines)
+    print("去重后的行数：", len(output_lines))
+# Usage: de-duplicate the merged list, writing the result to 2.txt
+remove_duplicates('汇总.txt', '2.txt')   
+