Update GAT.py
This commit is contained in:
@@ -81,7 +81,7 @@ def merge_txt_files(urls, output_filename='汇总.txt'):
|
|||||||
try:
|
try:
|
||||||
response = requests.get(url)
|
response = requests.get(url)
|
||||||
response.raise_for_status() # 确保请求成功
|
response.raise_for_status() # 确保请求成功
|
||||||
# 尝试将响应内容解码为UTF-8,如果失败则尝试其他编码
|
# 尝试将响应内容解码为UTF-8,如果失败则尝试其他编码
|
||||||
try:
|
try:
|
||||||
content = response.content.decode('utf-8')
|
content = response.content.decode('utf-8')
|
||||||
except UnicodeDecodeError:
|
except UnicodeDecodeError:
|
||||||
@@ -264,14 +264,14 @@ with open('汇总.txt', 'w', encoding='utf-8') as new_file:
|
|||||||
for line in lines:
|
for line in lines:
|
||||||
# 去除行尾的换行符
|
# 去除行尾的换行符
|
||||||
line = line.rstrip('\n')
|
line = line.rstrip('\n')
|
||||||
# 分割行,获取逗号前的字符串
|
# 分割行,获取逗号前的字符串
|
||||||
parts = line.split(',', 1)
|
parts = line.split(',', 1)
|
||||||
if len(parts) > 0:
|
if len(parts) > 0:
|
||||||
# 替换逗号前的字符串
|
# 替换逗号前的字符串
|
||||||
before_comma = parts[0]
|
before_comma = parts[0]
|
||||||
for old, new in replacements.items():
|
for old, new in replacements.items():
|
||||||
before_comma = before_comma.replace(old, new)
|
before_comma = before_comma.replace(old, new)
|
||||||
# 将替换后的逗号前部分和逗号后部分重新组合成一行,并写入新文件
|
# 将替换后的逗号前部分和逗号后部分重新组合成一行,并写入新文件
|
||||||
new_line = f'{before_comma},{parts[1]}\n' if len(parts) > 1 else f'{before_comma}\n'
|
new_line = f'{before_comma},{parts[1]}\n' if len(parts) > 1 else f'{before_comma}\n'
|
||||||
new_file.write(new_line)
|
new_file.write(new_line)
|
||||||
|
|
||||||
@@ -323,7 +323,7 @@ def remove_duplicates(input_file, output_file):
|
|||||||
if urls and urls[0] not in seen_urls:
|
if urls and urls[0] not in seen_urls:
|
||||||
seen_urls.add(urls[0])
|
seen_urls.add(urls[0])
|
||||||
output_lines.append(line)
|
output_lines.append(line)
|
||||||
# 如果找到包含genre的行,无论是否已被记录,都写入新文件
|
# 如果找到包含genre的行,无论是否已被记录,都写入新文件
|
||||||
if genre_line:
|
if genre_line:
|
||||||
output_lines.append(line)
|
output_lines.append(line)
|
||||||
# 将结果写入输出文件
|
# 将结果写入输出文件
|
||||||
@@ -362,7 +362,7 @@ with open('2.txt', 'r', encoding='utf-8') as file:
|
|||||||
for line in lines:
|
for line in lines:
|
||||||
# 首先检查行是否包含任何提取关键词
|
# 首先检查行是否包含任何提取关键词
|
||||||
if any(keyword in line for keyword in extract_keywords):
|
if any(keyword in line for keyword in extract_keywords):
|
||||||
# 如果包含提取关键词,进一步检查行是否不包含任何排除关键词
|
# 如果包含提取关键词,进一步检查行是否不包含任何排除关键词
|
||||||
if not any(keyword in line for keyword in excluded_keywords):
|
if not any(keyword in line for keyword in excluded_keywords):
|
||||||
outfile.write(line) # 写入符合条件的行到文件
|
outfile.write(line) # 写入符合条件的行到文件
|
||||||
|
|
||||||
@@ -378,7 +378,7 @@ def parse_file(input_file_path, output_file_name):
|
|||||||
with open(input_file_path, 'r', encoding='utf-8') as file:
|
with open(input_file_path, 'r', encoding='utf-8') as file:
|
||||||
for line in file:
|
for line in file:
|
||||||
line = line.strip()
|
line = line.strip()
|
||||||
# 如果行是分类标签行,则跳过
|
# 如果行是分类标签行,则跳过
|
||||||
if ",#genre#" in line:
|
if ",#genre#" in line:
|
||||||
continue
|
continue
|
||||||
# 检查行是否包含IP或域名
|
# 检查行是否包含IP或域名
|
||||||
@@ -386,7 +386,7 @@ def parse_file(input_file_path, output_file_name):
|
|||||||
if match:
|
if match:
|
||||||
# 提取匹配到的IP或域名
|
# 提取匹配到的IP或域名
|
||||||
matched_text = match.group(1)
|
matched_text = match.group(1)
|
||||||
# 去除IP或域名后的剩余部分,只保留匹配到的IP或域名
|
# 去除IP或域名后的剩余部分,只保留匹配到的IP或域名
|
||||||
ip_or_domain = matched_text.split('://')[-1].split('/')[0].split('::')[0]
|
ip_or_domain = matched_text.split('://')[-1].split('/')[0].split('::')[0]
|
||||||
# 将行添加到对应的IP或域名列表中
|
# 将行添加到对应的IP或域名列表中
|
||||||
if ip_or_domain not in ip_or_domain_to_lines:
|
if ip_or_domain not in ip_or_domain_to_lines:
|
||||||
@@ -395,9 +395,9 @@ def parse_file(input_file_path, output_file_name):
|
|||||||
############################################################################### 过滤掉小于1500字节的IP或域名段
|
############################################################################### 过滤掉小于1500字节的IP或域名段
|
||||||
filtered_ip_or_domain_to_lines = {ip_or_domain: lines for ip_or_domain, lines in ip_or_domain_to_lines.items()
|
filtered_ip_or_domain_to_lines = {ip_or_domain: lines for ip_or_domain, lines in ip_or_domain_to_lines.items()
|
||||||
if sum(len(line) for line in lines) >= 300}
|
if sum(len(line) for line in lines) >= 300}
|
||||||
# 如果没有满足条件的IP或域名段,则不生成文件
|
# 如果没有满足条件的IP或域名段,则不生成文件
|
||||||
if not filtered_ip_or_domain_to_lines:
|
if not filtered_ip_or_domain_to_lines:
|
||||||
print("没有满足条件的IP或域名段,不生成文件。")
|
print("没有满足条件的IP或域名段,不生成文件。")
|
||||||
return
|
return
|
||||||
# 合并所有满足条件的IP或域名的行到一个文件
|
# 合并所有满足条件的IP或域名的行到一个文件
|
||||||
with open(output_file_name, 'w', encoding='utf-8') as output_file:
|
with open(output_file_name, 'w', encoding='utf-8') as output_file:
|
||||||
@@ -426,7 +426,7 @@ with open(output_file, 'w', encoding='utf-8') as f:
|
|||||||
f.write(text_content + '\n')
|
f.write(text_content + '\n')
|
||||||
print(f"{keyword}获取完成")
|
print(f"{keyword}获取完成")
|
||||||
else:
|
else:
|
||||||
print(f'请求 {url} 失败,状态码:{response.status_code}')
|
print(f'请求 {url} 失败,状态码:{response.status_code}')
|
||||||
time.sleep(1) # 添加 1 秒的延迟
|
time.sleep(1) # 添加 1 秒的延迟
|
||||||
with open('gat.txt', 'r', encoding='utf-8') as infile:
|
with open('gat.txt', 'r', encoding='utf-8') as infile:
|
||||||
lines = infile.readlines()
|
lines = infile.readlines()
|
||||||
@@ -460,7 +460,7 @@ with open(output_file, 'w', encoding='utf-8') as f:
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error parsing content for keyword {keyword}: {e}")
|
print(f"Error parsing content for keyword {keyword}: {e}")
|
||||||
else:
|
else:
|
||||||
print(f'请求 {url} 失败,状态码:{response.status_code}')
|
print(f'请求 {url} 失败,状态码:{response.status_code}')
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error fetching URL for keyword {keyword}: {e}")
|
print(f"Error fetching URL for keyword {keyword}: {e}")
|
||||||
time.sleep(1) # 添加 1 秒的延迟
|
time.sleep(1) # 添加 1 秒的延迟
|
||||||
@@ -501,7 +501,7 @@ def remove_duplicates(input_file, output_file):
|
|||||||
if urls and urls[0] not in seen_urls:
|
if urls and urls[0] not in seen_urls:
|
||||||
seen_urls.add(urls[0])
|
seen_urls.add(urls[0])
|
||||||
output_lines.append(line)
|
output_lines.append(line)
|
||||||
# 如果找到包含genre的行,无论是否已被记录,都写入新文件
|
# 如果找到包含genre的行,无论是否已被记录,都写入新文件
|
||||||
if genre_line:
|
if genre_line:
|
||||||
output_lines.append(line)
|
output_lines.append(line)
|
||||||
# 将结果写入输出文件
|
# 将结果写入输出文件
|
||||||
@@ -510,12 +510,12 @@ def remove_duplicates(input_file, output_file):
|
|||||||
print("去重后的行数:", len(output_lines))
|
print("去重后的行数:", len(output_lines))
|
||||||
# 使用方法
|
# 使用方法
|
||||||
remove_duplicates('gat.txt', '网络收集.txt')
|
remove_duplicates('gat.txt', '网络收集.txt')
|
||||||
print("处理完成,去重完成")
|
print("处理完成,去重完成")
|
||||||
|
|
||||||
############################################ 假设要打开的文本文件名为*.txt
|
############################################ 假设要打开的文本文件名为*.txt
|
||||||
with open('网络收集.txt', 'r', encoding='utf-8') as f:
|
with open('网络收集.txt', 'r', encoding='utf-8') as f:
|
||||||
content1 = f.read()
|
content1 = f.read()
|
||||||
# 查找以'网络收集'命名的文件,可以遍历当前目录进行查找
|
# 查找以'网络收集'命名的文件,可以遍历当前目录进行查找
|
||||||
for filename in os.listdir():
|
for filename in os.listdir():
|
||||||
if '网络收集' in filename:
|
if '网络收集' in filename:
|
||||||
with open(filename, 'r', encoding='utf-8') as f:
|
with open(filename, 'r', encoding='utf-8') as f:
|
||||||
@@ -539,11 +539,11 @@ unique_lines = []
|
|||||||
seen_lines = set()
|
seen_lines = set()
|
||||||
# 打印去重前的行数
|
# 打印去重前的行数
|
||||||
print(f"去重前的行数: {len(lines)}")
|
print(f"去重前的行数: {len(lines)}")
|
||||||
# 遍历每一行,如果是新的就加入unique_lines
|
# 遍历每一行,如果是新的就加入unique_lines
|
||||||
for line in lines:
|
for line in lines:
|
||||||
line_stripped = line.strip() # 去除行尾的换行符
|
line_stripped = line.strip() # 去除行尾的换行符
|
||||||
if line_stripped not in seen_lines:
|
if line_stripped not in seen_lines:
|
||||||
unique_lines.append(line) # 保持原始行的格式,包括换行符
|
unique_lines.append(line) # 保持原始行的格式,包括换行符
|
||||||
seen_lines.add(line_stripped)
|
seen_lines.add(line_stripped)
|
||||||
# 将唯一的行写入新的文档
|
# 将唯一的行写入新的文档
|
||||||
with open('网络收集.txt', 'w', encoding="utf-8") as file:
|
with open('网络收集.txt', 'w', encoding="utf-8") as file:
|
||||||
@@ -555,7 +555,7 @@ print(f"去重后的行数: {len(unique_lines)}")
|
|||||||
|
|
||||||
################简体转繁体
|
################简体转繁体
|
||||||
#################################################################
|
#################################################################
|
||||||
# 创建一个OpenCC对象,指定转换的规则为繁体字转简体字
|
# 创建一个OpenCC对象,指定转换的规则为繁体字转简体字
|
||||||
converter = OpenCC('t2s.json')#繁转简
|
converter = OpenCC('t2s.json')#繁转简
|
||||||
#converter = OpenCC('s2t.json')#简转繁
|
#converter = OpenCC('s2t.json')#简转繁
|
||||||
# 打开txt文件
|
# 打开txt文件
|
||||||
@@ -566,14 +566,14 @@ simplified_text = converter.convert(traditional_text)
|
|||||||
# 将转换后的简体字写入txt文件
|
# 将转换后的简体字写入txt文件
|
||||||
with open('网络收集.txt', 'w', encoding='utf-8') as file:
|
with open('网络收集.txt', 'w', encoding='utf-8') as file:
|
||||||
file.write(simplified_text)
|
file.write(simplified_text)
|
||||||
print("处理完成,繁体转换完成")
|
print("处理完成,繁体转换完成")
|
||||||
|
|
||||||
|
|
||||||
######################################################################################提取goodiptv
|
######################################################################################提取goodiptv
|
||||||
import re
|
import re
|
||||||
import os
|
import os
|
||||||
# 定义一个包含所有要排除的关键词的列表
|
# 定义一个包含所有要排除的关键词的列表
|
||||||
excluded_keywords = ['zhoujie218', 'service', '112114', 'xfjcHD', '安徽', '文化', '电影台61', 'stream8.jlntv', 'live.cooltv', 'P2P', 'Plus', '38.64.72.148', '新闻综合', 'P3p', 'cookies', '9930/qilu', 'gitcode.net', 'free.xiptv'] #, '', ''
|
excluded_keywords = ['zhoujie218', 'service', '112114', 'xfjcHD', '安徽', '七彩', '文化', '电影台61', 'youtube', 'stream8.jlntv', 'live.cooltv', 'P2P', 'Plus', '38.64.72.148', '新闻综合', 'P3p', 'cookies', '9930/qilu', 'gitcode.net', 'free.xiptv'] #, '', ''
|
||||||
# 定义一个包含所有要提取的关键词的列表
|
# 定义一个包含所有要提取的关键词的列表
|
||||||
extract_keywords = [',']
|
extract_keywords = [',']
|
||||||
# 读取文件并处理每一行
|
# 读取文件并处理每一行
|
||||||
@@ -584,7 +584,7 @@ with open('网络收集.txt', 'r', encoding='utf-8') as file:
|
|||||||
for line in lines:
|
for line in lines:
|
||||||
# 首先检查行是否包含任何提取关键词
|
# 首先检查行是否包含任何提取关键词
|
||||||
if any(keyword in line for keyword in extract_keywords):
|
if any(keyword in line for keyword in extract_keywords):
|
||||||
# 如果包含提取关键词,进一步检查行是否不包含任何排除关键词
|
# 如果包含提取关键词,进一步检查行是否不包含任何排除关键词
|
||||||
if not any(keyword in line for keyword in excluded_keywords):
|
if not any(keyword in line for keyword in excluded_keywords):
|
||||||
outfile.write(line) # 写入符合条件的行到文件
|
outfile.write(line) # 写入符合条件的行到文件
|
||||||
|
|
||||||
@@ -621,7 +621,7 @@ print("/" * 80)
|
|||||||
replacements = {
|
replacements = {
|
||||||
" ": "",
|
" ": "",
|
||||||
}
|
}
|
||||||
# 打开原始文件读取内容,并写入新文件
|
# 打开原始文件读取内容,并写入新文件
|
||||||
with open('网络收集.txt', 'r', encoding='utf-8') as file:
|
with open('网络收集.txt', 'r', encoding='utf-8') as file:
|
||||||
lines = file.readlines()
|
lines = file.readlines()
|
||||||
# 创建新文件并写入替换后的内容
|
# 创建新文件并写入替换后的内容
|
||||||
@@ -642,10 +642,10 @@ from tqdm import tqdm
|
|||||||
def test_connectivity_and_download(url, initial_timeout=1, retry_timeout=1):
|
def test_connectivity_and_download(url, initial_timeout=1, retry_timeout=1):
|
||||||
parsed_url = urlparse(url)
|
parsed_url = urlparse(url)
|
||||||
if parsed_url.scheme not in ['http', 'https']:
|
if parsed_url.scheme not in ['http', 'https']:
|
||||||
# 非HTTP(s)协议,尝试RTSP检测
|
# 非HTTP(s)协议,尝试RTSP检测
|
||||||
return test_rtsp_connectivity(url, retry_timeout)
|
return test_rtsp_connectivity(url, retry_timeout)
|
||||||
else:
|
else:
|
||||||
# HTTP(s)协议,使用原始方法
|
# HTTP(s)协议,使用原始方法
|
||||||
try:
|
try:
|
||||||
with requests.get(url, stream=True, timeout=initial_timeout) as response:
|
with requests.get(url, stream=True, timeout=initial_timeout) as response:
|
||||||
if response.status_code == 200:
|
if response.status_code == 200:
|
||||||
@@ -678,7 +678,7 @@ def main(输入, 输出):
|
|||||||
results = []
|
results = []
|
||||||
for line_number, line in enumerate(tqdm(lines, desc="检测中")):
|
for line_number, line in enumerate(tqdm(lines, desc="检测中")):
|
||||||
parts = line.strip().split(",")
|
parts = line.strip().split(",")
|
||||||
if len(parts) == 2 and parts[1]: # 确保有URL,并且URL不为空
|
if len(parts) == 2 and parts[1]: # 确保有URL,并且URL不为空
|
||||||
channel_name, channel_url = parts
|
channel_name, channel_url = parts
|
||||||
try:
|
try:
|
||||||
is_valid = test_connectivity_and_download(channel_url)
|
is_valid = test_connectivity_and_download(channel_url)
|
||||||
@@ -717,11 +717,11 @@ from pypinyin import lazy_pinyin
|
|||||||
with open("网络收集.txt", "r", encoding="utf-8") as file:
|
with open("网络收集.txt", "r", encoding="utf-8") as file:
|
||||||
# 读取所有行并存储到列表中
|
# 读取所有行并存储到列表中
|
||||||
lines = file.readlines()
|
lines = file.readlines()
|
||||||
# 定义一个函数,用于提取每行的第一个数字
|
# 定义一个函数,用于提取每行的第一个数字
|
||||||
def extract_first_number(line):
|
def extract_first_number(line):
|
||||||
match = re.search(r'\d+', line)
|
match = re.search(r'\d+', line)
|
||||||
return int(match.group()) if match else float('inf')
|
return int(match.group()) if match else float('inf')
|
||||||
# 对列表中的行进行排序,按照第一个数字的大小排列,其余行按中文排序
|
# 对列表中的行进行排序,按照第一个数字的大小排列,其余行按中文排序
|
||||||
sorted_lines = sorted(lines, key=lambda x: (not 'CCTV' in x, extract_first_number(x) if 'CCTV' in x else lazy_pinyin(x.strip())))
|
sorted_lines = sorted(lines, key=lambda x: (not 'CCTV' in x, extract_first_number(x) if 'CCTV' in x else lazy_pinyin(x.strip())))
|
||||||
# 将排序后的行写入新的utf-8编码的文本文件
|
# 将排序后的行写入新的utf-8编码的文本文件
|
||||||
with open("网络收集.txt", "w", encoding="utf-8") as file:
|
with open("网络收集.txt", "w", encoding="utf-8") as file:
|
||||||
@@ -753,11 +753,17 @@ replacements = {
|
|||||||
"」": "",
|
"」": "",
|
||||||
"标清": "",
|
"标清": "",
|
||||||
"-": "",
|
"-": "",
|
||||||
"": "",
|
"NEWS": "新闻",
|
||||||
"": "",
|
"台,": ",",
|
||||||
"": "",
|
"歌厅": "",
|
||||||
"": "",
|
"秀,": ",",
|
||||||
"": "",
|
"TVBJade": "TVB",
|
||||||
|
"CantoneseSubtitles": "",
|
||||||
|
"+": "",
|
||||||
|
"财经新闻": "财经",
|
||||||
|
"番薯,": "番薯音乐,",
|
||||||
|
"番薯111,": "番薯音乐,",
|
||||||
|
"凤凰资讯,http://61.221": "中天娱乐,http://61.221",
|
||||||
"": "",
|
"": "",
|
||||||
"": "",
|
"": "",
|
||||||
"[1080p]": "",
|
"[1080p]": "",
|
||||||
@@ -940,18 +946,18 @@ with open('2.txt', 'w', encoding='utf-8') as new_file:
|
|||||||
for line in lines:
|
for line in lines:
|
||||||
# 去除行尾的换行符
|
# 去除行尾的换行符
|
||||||
line = line.rstrip('\n')
|
line = line.rstrip('\n')
|
||||||
# 分割行,获取逗号前的字符串
|
# 分割行,获取逗号前的字符串
|
||||||
parts = line.split(',', 1)
|
parts = line.split(',', 1)
|
||||||
if len(parts) > 0:
|
if len(parts) > 0:
|
||||||
# 替换逗号前的字符串
|
# 替换逗号前的字符串
|
||||||
before_comma = parts[0]
|
before_comma = parts[0]
|
||||||
for old, new in replacements.items():
|
for old, new in replacements.items():
|
||||||
before_comma = before_comma.replace(old, new)
|
before_comma = before_comma.replace(old, new)
|
||||||
# 将替换后的逗号前部分和逗号后部分重新组合成一行,并写入新文件
|
# 将替换后的逗号前部分和逗号后部分重新组合成一行,并写入新文件
|
||||||
new_line = f'{before_comma},{parts[1]}\n' if len(parts) > 1 else f'{before_comma}\n'
|
new_line = f'{before_comma},{parts[1]}\n' if len(parts) > 1 else f'{before_comma}\n'
|
||||||
new_file.write(new_line)
|
new_file.write(new_line)
|
||||||
|
|
||||||
print("替换完成,新文件已保存。")
|
print("替换完成,新文件已保存。")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@@ -970,7 +976,7 @@ def check_and_write_file(input_file, output_file, keywords):
|
|||||||
if re.search(pattern, line):
|
if re.search(pattern, line):
|
||||||
extracted_lines.append(line)
|
extracted_lines.append(line)
|
||||||
|
|
||||||
# 如果至少提取到一行,写入头部信息和提取的行到输出文件
|
# 如果至少提取到一行,写入头部信息和提取的行到输出文件
|
||||||
if extracted_lines:
|
if extracted_lines:
|
||||||
with open(output_file, 'w', encoding='utf-8') as out_file:
|
with open(output_file, 'w', encoding='utf-8') as out_file:
|
||||||
out_file.write(f"{keywords_list[0]},#genre#\n") # 写入头部信息
|
out_file.write(f"{keywords_list[0]},#genre#\n") # 写入头部信息
|
||||||
@@ -982,14 +988,14 @@ def check_and_write_file(input_file, output_file, keywords):
|
|||||||
# 检查文件的总大小
|
# 检查文件的总大小
|
||||||
file_size = os.path.getsize(output_file)
|
file_size = os.path.getsize(output_file)
|
||||||
|
|
||||||
# 如果文件大小小于30字节(假设的最小文件大小),删除文件
|
# 如果文件大小小于30字节(假设的最小文件大小),删除文件
|
||||||
if file_size < 20:
|
if file_size < 20:
|
||||||
os.remove(output_file)
|
os.remove(output_file)
|
||||||
print(f"文件只包含头部信息,{output_file} 已被删除。")
|
print(f"文件只包含头部信息,{output_file} 已被删除。")
|
||||||
else:
|
else:
|
||||||
print(f"文件已提取关键词并保存为: {output_file}")
|
print(f"文件已提取关键词并保存为: {output_file}")
|
||||||
else:
|
else:
|
||||||
print(f"未提取到关键词,不创建输出文件 {output_file}。")
|
print(f"未提取到关键词,不创建输出文件 {output_file}。")
|
||||||
|
|
||||||
# 按类别提取关键词并写入文件
|
# 按类别提取关键词并写入文件
|
||||||
#check_and_write_file('2.txt','a0.txt',keywords="央视频道1,CCTV")
|
#check_and_write_file('2.txt','a0.txt',keywords="央视频道1,CCTV")
|
||||||
@@ -1014,14 +1020,14 @@ for file_path in file_paths:
|
|||||||
with open(file_path, 'r', encoding="utf-8") as file:
|
with open(file_path, 'r', encoding="utf-8") as file:
|
||||||
content = file.read()
|
content = file.read()
|
||||||
file_contents.append(content)
|
file_contents.append(content)
|
||||||
else: # 如果文件不存在,则提示异常并打印提示信息
|
else: # 如果文件不存在,则提示异常并打印提示信息
|
||||||
print(f"文件 {file_path} 不存在,跳过")
|
print(f"文件 {file_path} 不存在,跳过")
|
||||||
# 写入合并后的文件
|
# 写入合并后的文件
|
||||||
with open("去重.txt", "w", encoding="utf-8") as output:
|
with open("去重.txt", "w", encoding="utf-8") as output:
|
||||||
output.write('\n'.join(file_contents))
|
output.write('\n'.join(file_contents))
|
||||||
|
|
||||||
###############################################################################################################################################################################################################################
|
###############################################################################################################################################################################################################################
|
||||||
##############################################################对生成的文件进行网址及文本去重复,避免同一个频道出现在不同的类中
|
##############################################################对生成的文件进行网址及文本去重复,避免同一个频道出现在不同的类中
|
||||||
|
|
||||||
def remove_duplicates(input_file, output_file):
|
def remove_duplicates(input_file, output_file):
|
||||||
# 用于存储已经遇到的URL和包含genre的行
|
# 用于存储已经遇到的URL和包含genre的行
|
||||||
@@ -1042,7 +1048,7 @@ def remove_duplicates(input_file, output_file):
|
|||||||
if urls and urls[0] not in seen_urls:
|
if urls and urls[0] not in seen_urls:
|
||||||
seen_urls.add(urls[0])
|
seen_urls.add(urls[0])
|
||||||
output_lines.append(line)
|
output_lines.append(line)
|
||||||
# 如果找到包含genre的行,无论是否已被记录,都写入新文件
|
# 如果找到包含genre的行,无论是否已被记录,都写入新文件
|
||||||
if genre_line:
|
if genre_line:
|
||||||
output_lines.append(line)
|
output_lines.append(line)
|
||||||
# 将结果写入输出文件
|
# 将结果写入输出文件
|
||||||
@@ -1061,7 +1067,7 @@ with open('网络收集.txt', 'r', encoding="utf-8") as file:
|
|||||||
unique_lines = []
|
unique_lines = []
|
||||||
seen_lines = set()
|
seen_lines = set()
|
||||||
|
|
||||||
# 遍历每一行,如果是新的就加入unique_lines
|
# 遍历每一行,如果是新的就加入unique_lines
|
||||||
for line in lines:
|
for line in lines:
|
||||||
if line not in seen_lines:
|
if line not in seen_lines:
|
||||||
unique_lines.append(line)
|
unique_lines.append(line)
|
||||||
@@ -1075,7 +1081,7 @@ with open('网络收集.txt', 'w', encoding="utf-8") as file:
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
################################################################################################任务结束,删除不必要的过程文件
|
################################################################################################任务结束,删除不必要的过程文件
|
||||||
files_to_remove = ['去重.txt', "2.txt", "a0.txt", "a.txt", "a1.txt", "b0.txt", "b.txt", "c.txt", "c1.txt", "c2.txt", "d.txt", "e.txt", "f0.txt", "f.txt", "f1.txt", "g0.txt", "g.txt", "g1.txt", "h0.txt", "h.txt", "h1.txt", "i.txt", \
|
files_to_remove = ['去重.txt', "2.txt", "a0.txt", "a.txt", "a1.txt", "b0.txt", "b.txt", "c.txt", "c1.txt", "c2.txt", "d.txt", "e.txt", "f0.txt", "f.txt", "f1.txt", "g0.txt", "g.txt", "g1.txt", "h0.txt", "h.txt", "h1.txt", "i.txt", \
|
||||||
"i1.txt", "j.txt", "j1.txt", "k.txt", "l0.txt", "l.txt", "l1.txt", "m.txt", "m1.txt", \
|
"i1.txt", "j.txt", "j1.txt", "k.txt", "l0.txt", "l.txt", "l1.txt", "m.txt", "m1.txt", \
|
||||||
"n0.txt","n.txt","n1.txt", "o1.txt", "o.txt", "p.txt"]
|
"n0.txt","n.txt","n1.txt", "o1.txt", "o.txt", "p.txt"]
|
||||||
@@ -1083,10 +1089,10 @@ files_to_remove = ['去重.txt', "2.txt", "a0.txt", "a.txt", "a1.txt", "b0.txt",
|
|||||||
for file in files_to_remove:
|
for file in files_to_remove:
|
||||||
if os.path.exists(file):
|
if os.path.exists(file):
|
||||||
os.remove(file)
|
os.remove(file)
|
||||||
else: # 如果文件不存在,则提示异常并打印提示信息
|
else: # 如果文件不存在,则提示异常并打印提示信息
|
||||||
print(f"文件 {file} 不存在,跳过删除。")
|
print(f"文件 {file} 不存在,跳过删除。")
|
||||||
|
|
||||||
print("任务运行完毕,gat频道列表可查看文件夹内综合源.txt文件!")
|
print("任务运行完毕,gat频道列表可查看文件夹内综合源.txt文件!")
|
||||||
|
|
||||||
|
|
||||||
def append_text_between_files(file1_path, file2_path):
|
def append_text_between_files(file1_path, file2_path):
|
||||||
@@ -1115,7 +1121,7 @@ file_path1 = '网络收集.txt'
|
|||||||
file_path2 = '综合源.txt'
|
file_path2 = '综合源.txt'
|
||||||
append_text_between_files(file_path1, file_path2)
|
append_text_between_files(file_path1, file_path2)
|
||||||
|
|
||||||
##############################################################对生成的文件进行网址及文本去重复,避免同一个频道出现在不同的类中
|
##############################################################对生成的文件进行网址及文本去重复,避免同一个频道出现在不同的类中
|
||||||
|
|
||||||
def remove_duplicates(input_file, output_file):
|
def remove_duplicates(input_file, output_file):
|
||||||
# 用于存储已经遇到的URL和包含genre的行
|
# 用于存储已经遇到的URL和包含genre的行
|
||||||
@@ -1136,7 +1142,7 @@ def remove_duplicates(input_file, output_file):
|
|||||||
if urls and urls[0] not in seen_urls:
|
if urls and urls[0] not in seen_urls:
|
||||||
seen_urls.add(urls[0])
|
seen_urls.add(urls[0])
|
||||||
output_lines.append(line)
|
output_lines.append(line)
|
||||||
# 如果找到包含genre的行,无论是否已被记录,都写入新文件
|
# 如果找到包含genre的行,无论是否已被记录,都写入新文件
|
||||||
if genre_line:
|
if genre_line:
|
||||||
output_lines.append(line)
|
output_lines.append(line)
|
||||||
# 将结果写入输出文件
|
# 将结果写入输出文件
|
||||||
@@ -1216,14 +1222,14 @@ with open(file_path, 'r+', encoding='utf-8') as f:
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
################################################################################################任务结束,删除不必要的过程文件
|
################################################################################################任务结束,删除不必要的过程文件
|
||||||
files_to_remove = ["gat.txt", "汇总.txt"]
|
files_to_remove = ["gat.txt", "汇总.txt"]
|
||||||
for file in files_to_remove:
|
for file in files_to_remove:
|
||||||
if os.path.exists(file):
|
if os.path.exists(file):
|
||||||
os.remove(file)
|
os.remove(file)
|
||||||
else: # 如果文件不存在,则提示异常并打印提示信息
|
else: # 如果文件不存在,则提示异常并打印提示信息
|
||||||
print(f"文件 {file} 不存在,跳过删除。")
|
print(f"文件 {file} 不存在,跳过删除。")
|
||||||
print("任务运行完毕,频道列表可查看文件夹内源.txt文件!")
|
print("任务运行完毕,频道列表可查看文件夹内源.txt文件!")
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import datetime
|
import datetime
|
||||||
@@ -1270,4 +1276,4 @@ delete_nonstandard_files()
|
|||||||
rename_standard_files()
|
rename_standard_files()
|
||||||
|
|
||||||
|
|
||||||
print("任务运行完毕,gat频道列表可查看文件夹内综合源.txt文件!")
|
print("任务运行完毕,gat频道列表可查看文件夹内综合源.txt文件!")
|
||||||
|
|||||||
Reference in New Issue
Block a user