#!/usr/bin/env python3
"""Command-line scraper for yiffer.xyz: browse/search comics and download them as CBZ archives."""
import urllib.request
import urllib.parse
import re
import json
import os
import sys
import tempfile
import zipfile
import argparse
import concurrent.futures

BASE_URL = "https://yiffer.xyz"
PICS_BASE_URL = "https://pics.yiffer.xyz"


def fetch_html(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    Returns None on HTTP 404; any other HTTPError propagates to the caller.
    A browser-like User-Agent is sent because the default urllib agent is
    commonly rejected.
    """
    req = urllib.request.Request(
        url,
        headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'},
    )
    try:
        with urllib.request.urlopen(req) as response:
            return response.read().decode('utf-8')
    except urllib.error.HTTPError as e:
        if e.code == 404:
            return None
        raise


def parse_turbo_stream(html):
    """Extract and parse the Remix turbo-stream JSON structure.

    The page embeds its data as JS calls of the form
    ``streamController.enqueue("<json-string>");`` where the argument is a
    JSON string literal whose *content* is itself a JSON array. Returns the
    concatenation of all such arrays (a flat list mixing strings, ints and
    dicts), or [] when *html* is falsy or nothing parses.
    """
    if not html:
        return []
    enqueues = re.findall(r'streamController\.enqueue\((.*?)\);', html, re.DOTALL)
    arr = []
    for chunk in enqueues:
        try:
            # First loads decodes the JS/JSON string literal...
            decoded_str = json.loads(chunk)
            # ...second loads parses the JSON array it contains.
            part_arr = json.loads(decoded_str)
            arr.extend(part_arr)
        except (ValueError, TypeError):
            # Best-effort: skip chunks that are not double-encoded JSON arrays.
            continue
    return arr


def browse_comics(page=1, search=None):
    """Browse or search comics from yiffer.xyz.

    Returns ``(comics, total_comics)`` where *comics* is a list of dicts with
    keys ``id``, ``name`` and ``pages``, and *total_comics* is the site's
    total result count (0 when unavailable). Returns ([], 0) on parse failure.

    The turbo-stream array is pointer-based: dict values are integer indices
    back into the same flat array, and dict keys are "_<index of the key
    string>" — hence the f"_{...}" key construction below.
    """
    url = f"{BASE_URL}/browse?page={page}"
    if search:
        url += f"&search={urllib.parse.quote(search)}"
    print(f"Fetching: {url}")
    html = fetch_html(url)
    arr = parse_turbo_stream(html)
    try:
        comics_idx = arr.index("comicsAndAds")
        comics_arr = arr[comics_idx + 1]
    except (ValueError, IndexError):
        print("Error: Could not find comics listing in the stream.")
        return [], 0
    try:
        id_key = f"_{arr.index('id')}"
        name_key = f"_{arr.index('name')}"
        pages_key = f"_{arr.index('numberOfPages')}"
    except ValueError:
        print("Error: Could not locate required keys in the stream.")
        return [], 0
    total_comics = 0
    try:
        tc_idx = arr.index("totalNumComics")
        val = arr[tc_idx + 1]
        if isinstance(val, int):
            total_comics = val
    except (ValueError, IndexError):
        pass  # total count is optional; 0 means "unknown"
    comics = []
    for ptr in comics_arr:
        item_obj = arr[ptr]
        # Ads share the listing array; keep only entries that look like comics.
        if isinstance(item_obj, dict) and id_key in item_obj and name_key in item_obj:
            c_id = arr[item_obj[id_key]]
            c_name = arr[item_obj[name_key]]
            c_pages = arr[item_obj[pages_key]] if pages_key in item_obj else 0
            comics.append({"id": c_id, "name": c_name, "pages": c_pages})
    return comics, total_comics


def get_comic_data(comic_name):
    """Get metadata and page tokens for a specific comic.

    Returns a dict ``{"id", "name", "pages"}`` where ``"pages"`` is the list
    of per-page image tokens used to build picture URLs, or None when the
    comic is missing or the stream cannot be parsed.
    """
    url = f"{BASE_URL}/c/{urllib.parse.quote(comic_name)}"
    html = fetch_html(url)
    if not html:
        print(f"Comic '{comic_name}' not found.")
        return None
    arr = parse_turbo_stream(html)
    if not arr:
        print("Failed to parse comic stream.")
        return None
    try:
        id_idx = arr.index("id")
        comic_id = arr[id_idx + 1]
    except (ValueError, IndexError):
        print("Could not find comic ID.")
        return None
    try:
        name_idx = arr.index("name")
        comic_name_real = arr[name_idx + 1]
    except (ValueError, IndexError):
        # Canonical name is optional; fall back to the caller's spelling.
        comic_name_real = comic_name
    try:
        pages_idx = arr.index("pages")
        pages_arr = arr[pages_idx + 1]
    except (ValueError, IndexError):
        print("Could not find pages array.")
        return None
    try:
        # Pointer-based keys: "_<index of the 'token' string>" (see browse_comics).
        token_key = f"_{arr.index('token')}"
    except ValueError:
        print("Could not find token string in array.")
        return None
    tokens = []
    for ptr in pages_arr:
        page_obj = arr[ptr]
        if token_key in page_obj:
            tokens.append(arr[page_obj[token_key]])
    return {"id": comic_id, "name": comic_name_real, "pages": tokens}


def download_image(url, output_path):
    """Download *url* and save the bytes to *output_path*.

    Returns ``(True, "Success")`` on success or ``(False, <error message>)``
    on any failure — errors are reported rather than raised so that one bad
    page does not abort a whole concurrent batch.
    """
    try:
        req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        with urllib.request.urlopen(req) as response:
            with open(output_path, 'wb') as f:
                f.write(response.read())
        return True, "Success"
    except Exception as e:
        return False, str(e)


def download_comic(comic_name, output_dir, max_workers=5):
    """Download all pages of a comic directly into a CBZ file.

    Pages are fetched concurrently into a temporary directory; the CBZ is
    written only when every page downloaded successfully. Existing non-empty
    CBZ files are skipped so the command is safely re-runnable.
    """
    print(f"Fetching metadata for '{comic_name}'...")
    comic_data = get_comic_data(comic_name)
    if not comic_data:
        return
    c_id = comic_data['id']
    c_name = comic_data['name']
    tokens = comic_data['pages']
    num_pages = len(tokens)
    print(f"Found comic: {c_name} (ID: {c_id}) with {num_pages} pages.")
    # Strip characters that are invalid in Windows filenames.
    safe_name = re.sub(r'[\\/*?:"<>|]', "", c_name)
    os.makedirs(output_dir, exist_ok=True)
    cbz_filepath = os.path.join(output_dir, f"{safe_name}.cbz")
    if os.path.exists(cbz_filepath) and os.path.getsize(cbz_filepath) > 0:
        print(f"Skipping: {cbz_filepath} already exists.")
        return
    print(f"Downloading {num_pages} pages into temporary directory before zipping...")
    with tempfile.TemporaryDirectory() as tmp_dir:
        tasks = []
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            for i, token in enumerate(tokens, 1):
                url = f"{PICS_BASE_URL}/comics/{c_id}/{token}.jpg"
                filename = f"{i:03d}.jpg"
                filepath = os.path.join(tmp_dir, filename)
                tasks.append((i, filename, executor.submit(download_image, url, filepath)))
            all_successful = True
            for i, filename, future in tasks:
                success, msg = future.result()
                # BUG FIX: these progress lines previously printed the literal
                # text "(unknown)" instead of the page's filename.
                if success:
                    print(f"[{i:03d}/{num_pages:03d}] Downloaded: {filename}")
                else:
                    print(f"[{i:03d}/{num_pages:03d}] FAILED: {filename} - {msg}")
                    all_successful = False
        if all_successful:
            print(f"Packaging into {cbz_filepath}...")
            # ZIP_STORED (no compression): JPEGs are already compressed and
            # CBZ readers expect plain stored archives.
            with zipfile.ZipFile(cbz_filepath, 'w', zipfile.ZIP_STORED) as cbz:
                for i in range(1, num_pages + 1):
                    filename = f"{i:03d}.jpg"
                    cbz.write(os.path.join(tmp_dir, filename), arcname=filename)
            print(f"\nDownload complete! Saved to {cbz_filepath}")
        else:
            print("\nDownload incomplete due to failures. Skipping CBZ creation.")


def main():
    """Parse CLI arguments and dispatch to the browse/download commands."""
    parser = argparse.ArgumentParser(description="Scraper for Yiffer.xyz comics")
    subparsers = parser.add_subparsers(dest="command", help="Command to run")
    # Browse/Search Command
    cmd_browse = subparsers.add_parser("browse", help="Browse or search for comics")
    cmd_browse.add_argument("--page", type=int, default=1, help="Page number to view (default 1)")
    cmd_browse.add_argument("--search", type=str, default="", help="Search query")
    # Download Command
    cmd_download = subparsers.add_parser("download", help="Download a specific comic by name")
    cmd_download.add_argument("name", type=str, help="Exact name of the comic to download (case-insensitive, exact match on URL)")
    cmd_download.add_argument("--output", type=str, default="downloads", help="Output directory to save the comic (default 'downloads')")
    cmd_download.add_argument("--workers", type=int, default=5, help="Number of concurrent downloads (default 5)")
    # Bulk Download Command
    cmd_recent = subparsers.add_parser("download-recent", help="Download a batch of the most recent comics")
    cmd_recent.add_argument("--count", type=int, default=5, help="Number of recent comics to download (default 5)")
    cmd_recent.add_argument("--output", type=str, default="downloads", help="Output directory to save the comics")
    cmd_recent.add_argument("--workers", type=int, default=5, help="Number of concurrent downloads (default 5)")
    args = parser.parse_args()
    if args.command == "browse":
        print(f"--- Yiffer.xyz Browse (Page {args.page}) {'Search: ' + args.search if args.search else ''} ---")
        comics, total = browse_comics(args.page, args.search)
        total = int(total) if total else 0
        if total > 0:
            print(f"Total comics found: {total}")
        print("-" * 60)
        for i, c in enumerate(comics, 1):
            print(f"{i:2d}. {c['name']} ({c['pages']} pages) [ID: {c['id']}]")
        print("-" * 60)
    elif args.command == "download":
        download_comic(args.name, args.output, args.workers)
    elif args.command == "download-recent":
        # Always fetch page 1 to get the most recent ones
        print(f"Fetching the {args.count} most recent comics...")
        comics, _ = browse_comics(1, "")
        if not comics:
            print("Could not retrieve comics list.")
            return
        comics_to_download = comics[:args.count]
        for idx, comic in enumerate(comics_to_download, 1):
            print(f"\n--- [{idx}/{len(comics_to_download)}] Downloading: {comic['name']} ---")
            download_comic(comic['name'], args.output, args.workers)
    else:
        parser.print_help()


if __name__ == "__main__":
    main()