Update 网络收集.py
This commit is contained in:
+35
-3
@@ -412,7 +412,7 @@ with open(file_path, 'r', encoding='utf-8') as file:
|
|||||||
# 获取总行数用于进度条
|
# 获取总行数用于进度条
|
||||||
total_lines = len(lines)
|
total_lines = len(lines)
|
||||||
# 写入通过检测的行到新文件
|
# 写入通过检测的行到新文件
|
||||||
with open(output_file_path, 'w', encoding='utf-8') as output_file:
|
with open(output_file_path, 'a', encoding='utf-8') as output_file:
|
||||||
# 使用tqdm显示进度条
|
# 使用tqdm显示进度条
|
||||||
for i, line in tqdm(enumerate(lines), total=total_lines, desc="Processing", unit='line'):
|
for i, line in tqdm(enumerate(lines), total=total_lines, desc="Processing", unit='line'):
|
||||||
# 检查是否包含 'genre'
|
# 检查是否包含 'genre'
|
||||||
@@ -438,7 +438,7 @@ with open(output_file_path, 'w', encoding='utf-8') as output_file:
|
|||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
frame_count = 0
|
frame_count = 0
|
||||||
# 尝试捕获4秒内的帧
|
# 尝试捕获4秒内的帧
|
||||||
while frame_count < 60 and (time.time() - start_time) < 3:
|
while frame_count < 50 and (time.time() - start_time) < 3:
|
||||||
ret, frame = cap.read()
|
ret, frame = cap.read()
|
||||||
if not ret:
|
if not ret:
|
||||||
break
|
break
|
||||||
@@ -447,7 +447,7 @@ with open(output_file_path, 'w', encoding='utf-8') as output_file:
|
|||||||
cap.release()
|
cap.release()
|
||||||
|
|
||||||
# 根据捕获的帧数判断状态并记录结果
|
# 根据捕获的帧数判断状态并记录结果
|
||||||
if frame_count >= 60: # 6秒内超过25帧则写入
|
if frame_count >= 50: # 6秒内超过25帧则写入
|
||||||
detected_ips[ip_key] = {'status': 'ok'}
|
detected_ips[ip_key] = {'status': 'ok'}
|
||||||
output_file.write(line) # 写入检测通过的行
|
output_file.write(line) # 写入检测通过的行
|
||||||
else:
|
else:
|
||||||
@@ -458,6 +458,38 @@ for ip_key, result in detected_ips.items():
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def remove_duplicates(input_file: str, output_file: str) -> None:
    """Copy *input_file* to *output_file*, dropping lines whose URL was already seen.

    Filtering rules, applied line by line in input order:

    * A line whose first URL has not been seen before is kept.
    * A line containing the whole word ``genre`` (case-insensitive) is
      always kept, whether or not it carries a URL.
    * Every other line (repeated URL, or neither URL nor ``genre``) is dropped.

    Each kept line is written exactly once, even when it matches both rules.

    Args:
        input_file: Path of the UTF-8 text file to read.
        output_file: Path of the UTF-8 text file to write. May be the same
            path as *input_file*; the input is read completely before the
            output is opened.
    """
    # The scheme is captured as a whole (http, https, rtmp, p2p, mitv, ...) so
    # that the same host/path under two different schemes is not mistaken for
    # a duplicate. It stays optional so any "://..." still counts as a URL.
    url_pattern = re.compile(
        r'(?:[A-Za-z][A-Za-z0-9+.\-]*)?://'
        r'(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )
    genre_pattern = re.compile(r'\bgenre\b', re.IGNORECASE)

    # Read everything up front: input and output may be the same file, and
    # opening it for writing would truncate it.
    with open(input_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    print("去重前的行数:", len(lines))

    seen_urls = set()
    output_lines = []
    for line in lines:
        url_match = url_pattern.search(line)
        is_new_url = url_match is not None and url_match.group(0) not in seen_urls
        if is_new_url:
            seen_urls.add(url_match.group(0))

        # One combined test so a line with both a new URL and "genre" is
        # written once, not twice.
        if is_new_url or genre_pattern.search(line) is not None:
            output_lines.append(line)

    with open(output_file, 'w', encoding='utf-8') as f:
        f.writelines(output_lines)
    print("去重后的行数:", len(output_lines))
|
||||||
|
# Usage: de-duplicate 网络收集.txt in place (the same file is passed as input and output)
|
||||||
|
remove_duplicates('网络收集.txt', '网络收集.txt')
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
############################################################################全部检测,防止IP段失效
|
############################################################################全部检测,防止IP段失效
|
||||||
import requests
|
import requests
|
||||||
|
|||||||
Reference in New Issue
Block a user