# File: tvbox-1/py/sinparty.py
# Last modified: 2026-02-27 02:19:32 +08:00 (153 lines, 7.2 KiB, Python)
#
# NOTE: the repository viewer flagged this file as containing ambiguous
# Unicode characters, i.e. characters that might be confused with other
# characters. Presumably this refers to the CJK text and full-width
# punctuation used in the comments and log strings below; if that is
# intentional, the warning can be safely ignored.
import asyncio
import aiohttp
import re
from playwright.async_api import async_playwright
# Absolute http(s) URL containing ".m3u8". The slashes after the scheme may be
# JSON-escaped ("\/"), which is why the separator class is [\\/]+.
_M3U8_PATTERN = re.compile(r'(https?:[\\/]+[^"\'\s]+\.m3u8[^"\'\s]*)')


async def fetch_m3u8(session: aiohttp.ClientSession, name: str, link: str):
    """Fetch one room page and sniff the first ``.m3u8`` stream URL out of it.

    The page source is downloaded with aiohttp (no browser involved) and
    searched with a regular expression. The original author's note says this
    keeps the concurrent phase stable and avoids OOM on GitHub Actions.

    Args:
        session: Shared aiohttp client session used for the request.
        name: Display name of the room; only used for logging and echoed back.
        link: URL of the room page to download.

    Returns:
        A ``(name, m3u8_url, link)`` tuple. ``m3u8_url`` is ``None`` when the
        request failed, the status was not 200, or no ``.m3u8`` URL was found;
        callers then fall back to ``link``.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    }
    # Explicit ClientTimeout instead of a bare number: 15 s for the whole
    # request, including reading the body.
    timeout = aiohttp.ClientTimeout(total=15)
    try:
        async with session.get(link, headers=headers, timeout=timeout) as response:
            if response.status == 200:
                # errors="replace": one undecodable byte must not discard a
                # page that may still contain the stream URL.
                text = await response.text(errors="replace")
                match = _M3U8_PATTERN.search(text)
                if match:
                    raw_url = match.group(1)
                    # Un-escape JSON slashes, including double-escaped "\\/".
                    m3u8_url = re.sub(r"\\+/", "/", raw_url)
                    # Restore ampersands that were JSON- or HTML-escaped.
                    m3u8_url = re.sub(r"\\+u0026", "&", m3u8_url).replace("&amp;", "&")
                    # A trailing backslash is the remnant of an escaped quote.
                    m3u8_url = m3u8_url.rstrip("\\")
                    print(f" [√ 流捕获] {name.ljust(15)} -> {m3u8_url}")
                    return name, m3u8_url, link
            else:
                print(f" [! 状态异常] {name.ljust(15)} -> HTTP {response.status}")
    except (aiohttp.ClientError, asyncio.TimeoutError) as exc:
        # Best effort per room: report why it failed, then fall through to the
        # fallback result so one bad room never aborts the whole batch.
        print(f" [! 请求失败] {name.ljust(15)} -> {type(exc).__name__}: {exc}")
    # No stream URL available: keep the original room link as the fallback.
    print(f" [- 未嗅探] {name.ljust(15)} -> 未发现直接 m3u8,保留原始跳转链接")
    return name, None, link
_SITE_ROOT = "https://sinparty.com"
_CARD_SELECTOR = ".content-gallery--live-listing .content-gallery__item"
# Hard stop for pagination in case the site keeps serving pages forever.
_MAX_PAGES = 500
_OUTPUT_PATH = "lib/party.m3u"


async def _scrape_rooms():
    """Walk the paginated live listing with Playwright and collect room links.

    Returns:
        A list of ``{"name": str, "link": str}`` dicts, de-duplicated by link.
        Pagination stops when a page shows no cards within 20 s, fails to
        load, contributes no new link, or ``_MAX_PAGES`` is reached.
    """
    from urllib.parse import urljoin

    from playwright.async_api import Error as PlaywrightError

    results = []
    seen_links = set()
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page(ignore_https_errors=True)
            # Hide the navigator.webdriver automation flag before any page script runs.
            await page.add_init_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
            print("\n[*] 已挂载 Stealth 反检测脚本,webdriver 特征已抹除。")
            for page_num in range(1, _MAX_PAGES + 1):
                url = f"{_SITE_ROOT}/?page={page_num}"
                print(f"\n{'='*50}")
                print(f"[*] 正在加载并抓取第 {page_num} 页数据... (URL: {url})")
                # Default navigation wait; the original deliberately dropped
                # "networkidle" as unreliable.
                try:
                    await page.goto(url)
                except PlaywrightError as exc:
                    # Keep what was already collected instead of aborting the run.
                    print(f"[!] 第 {page_num} 页加载失败 ({exc}),停止翻页并保留已抓取数据。")
                    break
                # Neutralise the blocking overlay / auth modal via CSS ...
                await page.add_style_tag(content='''
                    .app-modal__overlay, .modal-auth__inner { display: none !important; z-index: -9999 !important; }
                    body, html { pointer-events: auto !important; overflow: auto !important; user-select: auto !important; }
                ''')
                # ... and also remove those nodes from the DOM outright.
                await page.evaluate('''() => {
                    document.querySelectorAll('.app-modal__overlay, .modal-auth__inner').forEach(el => el.remove());
                }''')
                print("[*] 防御拦截层 (.app-modal__overlay) 已被强制物理摧毁,DOM 可点击限制已解除。")
                # Wait (up to 20 s) for real cards rather than skeleton placeholders.
                try:
                    print("[*] 正在等待目标外层节点渲染: .content-gallery--live-listing .content-gallery__item")
                    await page.wait_for_selector(_CARD_SELECTOR, timeout=20000)
                except PlaywrightError:
                    # No cards appeared: treated as having walked past the last page.
                    print(f"[!] 第 {page_num} 页未检测到有效在线主播数据,翻页结束。")
                    break
                cards = await page.locator(_CARD_SELECTOR).all()
                print(f"[*] 成功截胡当前页卡片数组,共包含 {len(cards)} 个目标节点。开始提取详细数据...")
                new_on_page = 0
                for card in cards:
                    try:
                        title_loc = card.locator(".cam-tile__title")
                        if await title_loc.count() > 0:
                            title = await title_loc.first.inner_text()
                        else:
                            title = "未知用户"
                        link_loc = card.locator("a.cam-tile")
                        if await link_loc.count() > 0:
                            href = await link_loc.first.get_attribute("href")
                        else:
                            href = ""
                    except PlaywrightError as exc:
                        # The listing is live; a card can vanish while being read.
                        print(f" [! 节点失效] 跳过该卡片: {exc}")
                        continue
                    if not href:
                        continue
                    # urljoin resolves "/room", "//host/room" and "room" alike
                    # and leaves absolute URLs untouched.
                    href = urljoin(f"{_SITE_ROOT}/", href)
                    if href in seen_links:
                        continue
                    seen_links.add(href)
                    # Collapse inner whitespace: a newline in the title would
                    # break the single-line #EXTINF entry written later.
                    name = " ".join(title.split()) or "未知用户"
                    print(f" [+ 数据提取] 节点: a.cam-tile | 标题: {name.ljust(15)} | 跳转链: {href}")
                    results.append({"name": name, "link": href})
                    new_on_page += 1
                if new_on_page == 0:
                    # The site is repeating content already seen: stop instead
                    # of looping forever.
                    print(f"[!] 第 {page_num} 页没有新的直播间链接,翻页结束。")
                    break
            else:
                print(f"[!] 已达到最大翻页上限 {_MAX_PAGES},停止翻页。")
        finally:
            # Always release the browser, even when scraping raised.
            await browser.close()
    return results


def _render_m3u(m3u8_results):
    """Format ``(name, m3u8_url, room_link)`` tuples as M3U playlist text.

    Returns:
        ``(playlist_text, success_count)`` where ``success_count`` is the
        number of entries that carry a real ``.m3u8`` URL rather than the
        fallback room link.
    """
    lines = ["#EXTM3U"]
    success_count = 0
    for name, m3u8_url, room_link in m3u8_results:
        lines.append(f'#EXTINF:-1 group-title="女生",{name}')
        # Prefer the sniffed stream; otherwise fall back to the room page URL.
        lines.append(m3u8_url if m3u8_url else room_link)
        if m3u8_url:
            success_count += 1
    return "\n".join(lines), success_count


async def main():
    """Run the pipeline: scrape room links, sniff streams, write the playlist.

    Phase 1 collects room links with Playwright, phase 2 calls ``fetch_m3u8``
    for every room concurrently over one aiohttp session, and phase 3 prints
    the resulting M3U text and writes it to ``lib/party.m3u``.
    """
    from pathlib import Path

    # === Phase 1: Playwright pagination scrape ===
    results = await _scrape_rooms()

    # === Phase 2: concurrent m3u8 sniffing over aiohttp ===
    print(f"\n{'='*50}")
    print(f"[*] 全站遍历完毕,共提取 {len(results)} 个直播间链接。")
    print("[*] 启动 aiohttp 协程池 (TCPConnector limit=100),开始高并发底层嗅探...")
    print(f"{'='*50}\n")
    # NOTE(review): ssl=False disables TLS certificate verification; it mirrors
    # ignore_https_errors=True used for the browser. Confirm this is intended.
    connector = aiohttp.TCPConnector(limit=100, ssl=False)
    async with aiohttp.ClientSession(connector=connector) as session:
        tasks = [fetch_m3u8(session, res["name"], res["link"]) for res in results]
        m3u8_results = await asyncio.gather(*tasks)

    # === Phase 3: format and write the M3U playlist ===
    m3u_content, success_count = _render_m3u(m3u8_results)
    print(f"\n{'='*50}")
    print("=== 转换格式 M3U 输出 ===")
    print(m3u_content)
    print(f"{'='*50}")
    output_path = Path(_OUTPUT_PATH)
    # Create lib/ on demand so a fresh checkout does not fail on open().
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(m3u_content)
    print(f"\n[*] 并发处理彻底完成!")
    print(f"[*] 成功解析并提取 {success_count} 个底层视频流。")
    print(f"[*] 总计写入 {len(results)} 条格式化数据至 lib/party.m3u。")
if __name__ == "__main__":
    # Script entry point: drive the async pipeline on a fresh event loop.
    asyncio.run(main())