import sys, re, requests from requests.adapters import HTTPAdapter from requests.packages.urllib3.util.retry import Retry from base.spider import Spider requests.packages.urllib3.disable_warnings() class Spider(Spider): def getName(self): return "Jable" def init(self, extend=""): self.siteUrl = "https://jable.tv" self.headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8", "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8", "Referer": "https://jable.tv/", } self.sess = requests.Session() self.sess.mount('https://', HTTPAdapter(max_retries=Retry(total=3, status_forcelist=[500, 502, 503, 504]))) def fetch(self, url): try: return self.sess.get(url, headers=self.headers, timeout=15, verify=False) except: return None def homeContent(self, filter): r = self.fetch(self.siteUrl) cats = [] if r and r.ok: # 修复优化:放弃失效的 class="tag",改为直接抓取带有 categories/tags/hot 等真实路径的 A 标签 pattern = r'href=["\'](?:https://jable\.tv)?/((?:categories|tags)/[^"\'/]+|latest-updates|hot)/?["\'][^>]*>(.*?)' for m in re.finditer(pattern, r.text, re.I): tid = m.group(1).strip('/') name = re.sub(r'<[^>]+>', '', m.group(2)).strip() # 过滤掉空值及重复项,确保抓取到的分类合法 if tid and name and len(name) > 0 and not name.isspace() and tid not in [c['type_id'] for c in cats]: cats.append({"type_id": tid, "type_name": name}) # 兜底静态分类优化:增加常用分类,以防极端网络情况下首页解析为空 if not cats: cats = [ {"type_id": "latest-updates", "type_name": "最近更新"}, {"type_id": "hot", "type_name": "热门影片"}, {"type_id": "categories/chinese-subtitle", "type_name": "中文字幕"}, {"type_id": "categories/uncensored", "type_name": "無碼"}, {"type_id": "categories/lesbian", "type_name": "女同"}, {"type_id": "categories/creampie", "type_name": "中出"} ] return {'class': cats} def categoryContent(self, tid, pg, filter, extend): url = f"{self.siteUrl}/{tid}/{pg}/" if str(pg) != '1' else f"{self.siteUrl}/{tid}/" return self.postList(url, int(pg)) def searchContent(self, key, quick, pg=1): url = f"{self.siteUrl}/search/{key}/{pg}/" if str(pg) != '1' else f"{self.siteUrl}/search/{key}/" return self.postList(url, int(pg)) def postList(self, url, pg): r = self.fetch(url) l = [] if r and r.ok: blocks = r.text.split('