From 3048473c66b4ea5bf43460b8faeec80db5f2a5c5 Mon Sep 17 00:00:00 2001 From: morpheus Date: Sun, 15 Mar 2026 22:13:52 -0300 Subject: [PATCH] Initial commit of Yiffer CLI scraper (Termux compatible) with CBZ support --- README.md | 66 ++++++++++++ yiffer_scraper.py | 266 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 332 insertions(+) create mode 100644 README.md create mode 100755 yiffer_scraper.py diff --git a/README.md b/README.md new file mode 100644 index 0000000..b771498 --- /dev/null +++ b/README.md @@ -0,0 +1,66 @@ +# Yiffer Scraper + +A pure Python scraper to download comics from yiffer.xyz, automatically packing them into `.cbz` files. + +It is completely written using Python's standard library, meaning it works out-of-the-box anywhere Python 3 is installed—including on Android via Termux! + +## Features +- **Browse & Search:** Search for comics directly from the terminal. +- **Concurrent Downloads:** Download pages in parallel. +- **CBZ Export:** Packages downloads directly into CBZ format. +- **Termux Compatible:** Requires zero third-party packages (no `pip install requests` necessary). + +## Installation & Usage (All Platforms) + +You only need Python 3 installed. If you're on Linux, macOS, or Windows, simply download `yiffer_scraper.py` and run it: + +```bash +python3 yiffer_scraper.py browse +python3 yiffer_scraper.py download "Comic Name" +``` + +## Termux Setup (Android) + +If you are using Termux on Android, setup is extremely simple: + +1. Install Python: +```bash +pkg update && pkg install python -y +``` + +2. Make the script executable: +```bash +chmod +x yiffer_scraper.py +``` + +3. 
Run the script:
+```bash
+./yiffer_scraper.py --help
+```
+
+## Commands
+
+### Browse
+View the latest comics on the site:
+```bash
+./yiffer_scraper.py browse
+./yiffer_scraper.py browse --page 2
+```
+
+### Search
+Search for specific comics:
+```bash
+./yiffer_scraper.py browse --search "Bifurcation"
+```
+
+### Download
+Download a specific comic by exact name:
+```bash
+./yiffer_scraper.py download "Bifurcation" --output ./my_comics
+```
+
+### Download Recent (Bulk)
+Download the latest `N` comics from the homepage automatically:
+```bash
+./yiffer_scraper.py download-recent --count 10 --output ./my_comics
+```
diff --git a/yiffer_scraper.py b/yiffer_scraper.py
new file mode 100755
index 0000000..59728cc
--- /dev/null
+++ b/yiffer_scraper.py
@@ -0,0 +1,266 @@
+#!/usr/bin/env python3
+import urllib.request
+import urllib.parse
+import re
+import json
+import os
+import sys
+import tempfile
+import zipfile
+import argparse
+import concurrent.futures
+
+BASE_URL = "https://yiffer.xyz"
+PICS_BASE_URL = "https://pics.yiffer.xyz"
+
+def fetch_html(url):
+    # Fetch a URL as UTF-8 text; returns None on 404, re-raises other HTTP errors.
+    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'})
+    try:
+        with urllib.request.urlopen(req) as response:
+            return response.read().decode('utf-8')
+    except urllib.error.HTTPError as e:
+        if e.code == 404:
+            return None
+        raise
+
+def parse_turbo_stream(html):
+    """Extracts and parses the Remix turbo-stream JSON structure."""
+    if not html:
+        return []
+
+    enqueues = re.findall(r'streamController\.enqueue\((.*?)\);', html, re.DOTALL)
+    arr = []
+    for e in enqueues:
+        try:
+            # First loads decodes the JS string literal
+            decoded_str = json.loads(e)
+            # Second loads parses the JSON array string
+            part_arr = json.loads(decoded_str)
+            arr.extend(part_arr)
+        except Exception:
+            pass
+    return arr
+
+def browse_comics(page=1, search=None):
+    """Browse or search comics from yiffer.xyz."""
+    url = f"{BASE_URL}/browse?page={page}"
+    if search:
+        url += f"&search={urllib.parse.quote(search)}"
+
+    print(f"Fetching: {url}")
+    html = fetch_html(url)
+    arr = parse_turbo_stream(html)
+
+    try:
+        comics_idx = arr.index("comicsAndAds")
+        comics_arr = arr[comics_idx + 1]
+    except ValueError:
+        print("Error: Could not find comics listing in the stream.")
+        return [], 0
+
+    try:
+        id_key = f"_{arr.index('id')}"
+        name_key = f"_{arr.index('name')}"
+        pages_key = f"_{arr.index('numberOfPages')}"
+    except ValueError:
+        print("Error: Could not locate required keys in the stream.")
+        return [], 0
+
+    total_comics = 0
+    try:
+        tc_idx = arr.index("totalNumComics")
+        val = arr[tc_idx + 1]
+        if isinstance(val, int):
+            total_comics = val
+    except ValueError:
+        pass
+
+    comics = []
+    for ptr in comics_arr:
+        item_obj = arr[ptr]
+        if isinstance(item_obj, dict) and id_key in item_obj and name_key in item_obj:
+            c_id = arr[item_obj[id_key]]
+            c_name = arr[item_obj[name_key]]
+            c_pages = arr[item_obj[pages_key]] if pages_key in item_obj else 0
+            comics.append({"id": c_id, "name": c_name, "pages": c_pages})
+
+    return comics, total_comics
+
+def get_comic_data(comic_name):
+    """Get metadata and page tokens for a specific comic."""
+    url = f"{BASE_URL}/c/{urllib.parse.quote(comic_name)}"
+    html = fetch_html(url)
+
+    if not html:
+        print(f"Comic '{comic_name}' not found.")
+        return None
+
+    arr = parse_turbo_stream(html)
+    if not arr:
+        print("Failed to parse comic stream.")
+        return None
+
+    try:
+        id_idx = arr.index("id")
+        comic_id = arr[id_idx + 1]
+    except ValueError:
+        print("Could not find comic ID.")
+        return None
+
+    try:
+        name_idx = arr.index("name")
+        comic_name_real = arr[name_idx + 1]
+    except ValueError:
+        comic_name_real = comic_name
+
+    try:
+        pages_idx = arr.index("pages")
+        pages_arr = arr[pages_idx + 1]
+    except ValueError:
+        print("Could not find pages array.")
+        return None
+
+    try:
+        token_str_idx = arr.index("token")
+        token_key = f"_{token_str_idx}"
+    except ValueError:
+        print("Could not find token string in array.")
+        return None
+
+    tokens = []
+    for ptr in pages_arr:
+        page_obj = arr[ptr]
+        if token_key in page_obj:
+            target_ptr = page_obj[token_key]
+            tokens.append(arr[target_ptr])
+
+    return {
+        "id": comic_id,
+        "name": comic_name_real,
+        "pages": tokens
+    }
+
+def download_image(url, output_path):
+    """Download an image from the given URL and save it to output_path."""
+    try:
+        req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
+        with urllib.request.urlopen(req) as response:
+            with open(output_path, 'wb') as f:
+                f.write(response.read())
+        return True, "Success"
+    except Exception as e:
+        return False, str(e)
+
+def download_comic(comic_name, output_dir, max_workers=5):
+    """Download all pages of a comic directly into a CBZ file."""
+    print(f"Fetching metadata for '{comic_name}'...")
+    comic_data = get_comic_data(comic_name)
+
+    if not comic_data:
+        return
+
+    c_id = comic_data['id']
+    c_name = comic_data['name']
+    tokens = comic_data['pages']
+    num_pages = len(tokens)
+
+    print(f"Found comic: {c_name} (ID: {c_id}) with {num_pages} pages.")
+
+    safe_name = re.sub(r'[\\/*?:"<>|]', "", c_name)
+    os.makedirs(output_dir, exist_ok=True)
+    cbz_filepath = os.path.join(output_dir, f"{safe_name}.cbz")
+
+    if os.path.exists(cbz_filepath) and os.path.getsize(cbz_filepath) > 0:
+        print(f"Skipping: {cbz_filepath} already exists.")
+        return
+
+    print(f"Downloading {num_pages} pages into temporary directory before zipping...")
+
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        tasks = []
+        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
+            for i, token in enumerate(tokens, 1):
+                url = f"{PICS_BASE_URL}/comics/{c_id}/{token}.jpg"
+                filename = f"{i:03d}.jpg"
+                filepath = os.path.join(tmp_dir, filename)
+
+                tasks.append(
+                    (i, num_pages, filename, executor.submit(download_image, url, filepath))
+                )
+
+            all_successful = True
+            for i, num_pages, filename, future in tasks:
+                success, msg = future.result()
+                if success:
+                    print(f"[{i:03d}/{num_pages:03d}] Downloaded: {filename}")
+                else:
+                    print(f"[{i:03d}/{num_pages:03d}] FAILED: {filename} - {msg}")
+                    all_successful = False
+
+        if all_successful:
+            print(f"Packaging into {cbz_filepath}...")
+            with zipfile.ZipFile(cbz_filepath, 'w', zipfile.ZIP_STORED) as cbz:
+                for i in range(1, num_pages + 1):
+                    filename = f"{i:03d}.jpg"
+                    filepath = os.path.join(tmp_dir, filename)
+                    cbz.write(filepath, arcname=filename)
+            print(f"\nDownload complete! Saved to {cbz_filepath}")
+        else:
+            print("\nDownload incomplete due to failures. Skipping CBZ creation.")
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Scraper for Yiffer.xyz comics")
+    subparsers = parser.add_subparsers(dest="command", help="Command to run")
+
+    # Browse/Search Command
+    cmd_browse = subparsers.add_parser("browse", help="Browse or search for comics")
+    cmd_browse.add_argument("--page", type=int, default=1, help="Page number to view (default 1)")
+    cmd_browse.add_argument("--search", type=str, default="", help="Search query")
+
+    # Download Command
+    cmd_download = subparsers.add_parser("download", help="Download a specific comic by name")
+    cmd_download.add_argument("name", type=str, help="Exact name of the comic to download (case-insensitive, exact match on URL)")
+    cmd_download.add_argument("--output", type=str, default="downloads", help="Output directory to save the comic (default 'downloads')")
+    cmd_download.add_argument("--workers", type=int, default=5, help="Number of concurrent downloads (default 5)")
+
+    # Bulk Download Command
+    cmd_recent = subparsers.add_parser("download-recent", help="Download a batch of the most recent comics")
+    cmd_recent.add_argument("--count", type=int, default=5, help="Number of recent comics to download (default 5)")
+    cmd_recent.add_argument("--output", type=str, default="downloads", help="Output directory to save the comics")
+    cmd_recent.add_argument("--workers", type=int, default=5, help="Number of concurrent downloads (default 5)")
+
+    args = parser.parse_args()
+
+    if args.command == "browse":
+        print(f"--- Yiffer.xyz Browse (Page {args.page}) {'Search: ' + args.search if args.search else ''} ---")
+        comics, total = browse_comics(args.page, args.search)
+        total = int(total) if total else 0
+        if total > 0:
+            print(f"Total comics found: {total}")
+        print("-" * 60)
+        for i, c in enumerate(comics, 1):
+            print(f"{i:2d}. {c['name']} ({c['pages']} pages) [ID: {c['id']}]")
+        print("-" * 60)
+
+    elif args.command == "download":
+        download_comic(args.name, args.output, args.workers)
+
+    elif args.command == "download-recent":
+        # Always fetch page 1 to get the most recent ones
+        print(f"Fetching the {args.count} most recent comics...")
+        comics, _ = browse_comics(1, "")
+        if not comics:
+            print("Could not retrieve comics list.")
+            return
+
+        comics_to_download = comics[:args.count]
+        for idx, comic in enumerate(comics_to_download, 1):
+            print(f"\n--- [{idx}/{len(comics_to_download)}] Downloading: {comic['name']} ---")
+            download_comic(comic['name'], args.output, args.workers)
+
+    else:
+        parser.print_help()
+
+if __name__ == "__main__":
+    main()