From 3048473c66b4ea5bf43460b8faeec80db5f2a5c5 Mon Sep 17 00:00:00 2001 From: morpheus Date: Sun, 15 Mar 2026 22:13:52 -0300 Subject: [PATCH] Initial commit of Yiffer CLI scraper (Termux compatible) with CBZ support --- README.md | 66 ++++++++++++ yiffer_scraper.py | 266 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 332 insertions(+) create mode 100644 README.md create mode 100755 yiffer_scraper.py diff --git a/README.md b/README.md new file mode 100644 index 0000000..b771498 --- /dev/null +++ b/README.md @@ -0,0 +1,66 @@ +# Yiffer Scraper + +A pure Python scraper to download comics from yiffer.xyz, automatically packing them into `.cbz` files. + +It is completely written using Python's standard library, meaning it works out-of-the-box anywhere Python 3 is installed—including on Android via Termux! + +## Features +- **Browse & Search:** Search for comics directly from the terminal. +- **Concurrent Downloads:** Download pages in parallel. +- **CBZ Export:** Packages downloads directly into CBZ format. +- **Termux Compatible:** Requires zero third-party packages (no `pip install requests` necessary). + +## Installation & Usage (All Platforms) + +You only need Python 3 installed. If you're on Linux, macOS, or Windows, simply download `yiffer_scraper.py` and run it: + +```bash +python3 yiffer_scraper.py browse +python3 yiffer_scraper.py download "Comic Name" +``` + +## Termux Setup (Android) + +If you are using Termux on Android, setup is extremely simple: + +1. Install Python: +```bash +pkg update && pkg install python -y +``` + +2. Make the script executable: +```bash +chmod +x yiffer_scraper.py +``` + +3. 
Run the script:
+```bash
+./yiffer_scraper.py --help
+```
+
+## Commands
+
+### Browse
+View the latest comics on the site:
+```bash
+./yiffer_scraper.py browse
+./yiffer_scraper.py browse --page 2
+```
+
+### Search
+Search for specific comics:
+```bash
+./yiffer_scraper.py browse --search "Bifurcation"
+```
+
+### Download
+Download a specific comic by exact name:
+```bash
+./yiffer_scraper.py download "Bifurcation" --output ./my_comics
+```
+
+### Download Recent (Bulk)
+Download the latest `N` comics from the homepage automatically:
+```bash
+./yiffer_scraper.py download-recent --count 10 --output ./my_comics
+```
diff --git a/yiffer_scraper.py b/yiffer_scraper.py
new file mode 100755
index 0000000..59728cc
--- /dev/null
+++ b/yiffer_scraper.py
@@ -0,0 +1,266 @@
+#!/usr/bin/env python3
+import urllib.request
+import urllib.parse
+import re
+import json
+import os
+import sys
+import tempfile
+import zipfile
+import argparse
+import concurrent.futures
+
+BASE_URL = "https://yiffer.xyz"
+PICS_BASE_URL = "https://pics.yiffer.xyz"
+
+def fetch_html(url):
+    # Fetch a URL as UTF-8 text; returns None on 404, re-raises other HTTP errors.
+    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'})
+    try:
+        with urllib.request.urlopen(req) as response:
+            return response.read().decode('utf-8')
+    except urllib.error.HTTPError as e:
+        if e.code == 404:
+            return None
+        raise
+
+def parse_turbo_stream(html):
+    """Extracts and parses the Remix turbo-stream JSON structure."""
+    if not html:
+        return []
+
+    enqueues = re.findall(r'streamController\.enqueue\((.*?)\);', html, re.DOTALL)
+    arr = []
+    for e in enqueues:
+        try:
+            # First loads decodes the JS string literal
+            decoded_str = json.loads(e)
+            # Second loads parses the JSON array string
+            part_arr = json.loads(decoded_str)
+            arr.extend(part_arr)
+        except Exception:
+            pass
+    return arr
+
+def browse_comics(page=1, search=None):
+    """Browse or search comics from yiffer.xyz."""
+    url = f"{BASE_URL}/browse?page={page}"
+    if search:
+        url += f"&search={urllib.parse.quote(search)}"
+
+    print(f"Fetching: {url}")
+    html = fetch_html(url)
+    arr = parse_turbo_stream(html)
+
+    try:
+        comics_idx = arr.index("comicsAndAds")
+        comics_arr = arr[comics_idx + 1]
+    except ValueError:
+        print("Error: Could not find comics listing in the stream.")
+        return [], 0
+
+    try:
+        id_key = f"_{arr.index('id')}"
+        name_key = f"_{arr.index('name')}"
+        pages_key = f"_{arr.index('numberOfPages')}"
+    except ValueError:
+        print("Error: Could not locate required keys in the stream.")
+        return [], 0
+
+    total_comics = 0
+    try:
+        tc_idx = arr.index("totalNumComics")
+        val = arr[tc_idx + 1]
+        if isinstance(val, int):
+            total_comics = val
+    except ValueError:
+        pass
+
+    comics = []
+    for ptr in comics_arr:
+        item_obj = arr[ptr]
+        if isinstance(item_obj, dict) and id_key in item_obj and name_key in item_obj:
+            c_id = arr[item_obj[id_key]]
+            c_name = arr[item_obj[name_key]]
+            c_pages = arr[item_obj[pages_key]] if pages_key in item_obj else 0
+            comics.append({"id": c_id, "name": c_name, "pages": c_pages})
+
+    return comics, total_comics
+
+def get_comic_data(comic_name):
+    """Get metadata and page tokens for a specific comic."""
+    url = f"{BASE_URL}/c/{urllib.parse.quote(comic_name)}"
+    html = fetch_html(url)
+
+    if not html:
+        print(f"Comic '{comic_name}' not found.")
+        return None
+
+    arr = parse_turbo_stream(html)
+    if not arr:
+        print("Failed to parse comic stream.")
+        return None
+
+    try:
+        id_idx = arr.index("id")
+        comic_id = arr[id_idx + 1]
+    except ValueError:
+        print("Could not find comic ID.")
+        return None
+
+    try:
+        name_idx = arr.index("name")
+        comic_name_real = arr[name_idx + 1]
+    except ValueError:
+        comic_name_real = comic_name
+
+    try:
+        pages_idx = arr.index("pages")
+        pages_arr = arr[pages_idx + 1]
+    except ValueError:
+        print("Could not find pages array.")
+        return None
+
+    try:
+        token_str_idx = arr.index("token")
+        token_key = f"_{token_str_idx}"
+    except ValueError:
+        print("Could not find token string in array.")
+        return None
+
+    tokens = []
+    for ptr in pages_arr:
+        page_obj = arr[ptr]
+        if token_key in page_obj:
+            target_ptr = page_obj[token_key]
+            tokens.append(arr[target_ptr])
+
+    return {
+        "id": comic_id,
+        "name": comic_name_real,
+        "pages": tokens
+    }
+
+def download_image(url, output_path):
+    """Download an image from the given URL and save it to output_path."""
+    try:
+        req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
+        with urllib.request.urlopen(req) as response:
+            with open(output_path, 'wb') as f:
+                f.write(response.read())
+        return True, "Success"
+    except Exception as e:
+        return False, str(e)
+
+def download_comic(comic_name, output_dir, max_workers=5):
+    """Download all pages of a comic directly into a CBZ file."""
+    print(f"Fetching metadata for '{comic_name}'...")
+    comic_data = get_comic_data(comic_name)
+
+    if not comic_data:
+        return
+
+    c_id = comic_data['id']
+    c_name = comic_data['name']
+    tokens = comic_data['pages']
+    num_pages = len(tokens)
+
+    print(f"Found comic: {c_name} (ID: {c_id}) with {num_pages} pages.")
+
+    safe_name = re.sub(r'[\\/*?:"<>|]', "", c_name)
+    os.makedirs(output_dir, exist_ok=True)
+    cbz_filepath = os.path.join(output_dir, f"{safe_name}.cbz")
+
+    if os.path.exists(cbz_filepath) and os.path.getsize(cbz_filepath) > 0:
+        print(f"Skipping: {cbz_filepath} already exists.")
+        return
+
+    print(f"Downloading {num_pages} pages into temporary directory before zipping...")
+
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        tasks = []
+        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
+            for i, token in enumerate(tokens, 1):
+                url = f"{PICS_BASE_URL}/comics/{c_id}/{token}.jpg"
+                filename = f"{i:03d}.jpg"
+                filepath = os.path.join(tmp_dir, filename)
+
+                tasks.append(
+                    (i, num_pages, filename, executor.submit(download_image, url, filepath))
+                )
+
+            all_successful = True
+            for i, num_pages, filename, future in tasks:
+                success, msg = future.result()
+                if success:
+                    print(f"[{i:03d}/{num_pages:03d}] Downloaded: {filename}")
+                else:
+                    print(f"[{i:03d}/{num_pages:03d}] FAILED: {filename} - {msg}")
+                    all_successful = False
+
+        if all_successful:
+            print(f"Packaging into {cbz_filepath}...")
+            with zipfile.ZipFile(cbz_filepath, 'w', zipfile.ZIP_STORED) as cbz:
+                for i in range(1, num_pages + 1):
+                    filename = f"{i:03d}.jpg"
+                    filepath = os.path.join(tmp_dir, filename)
+                    cbz.write(filepath, arcname=filename)
+            print(f"\nDownload complete! Saved to {cbz_filepath}")
+        else:
+            print("\nDownload incomplete due to failures. Skipping CBZ creation.")
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Scraper for Yiffer.xyz comics")
+    subparsers = parser.add_subparsers(dest="command", help="Command to run")
+
+    # Browse/Search Command
+    cmd_browse = subparsers.add_parser("browse", help="Browse or search for comics")
+    cmd_browse.add_argument("--page", type=int, default=1, help="Page number to view (default 1)")
+    cmd_browse.add_argument("--search", type=str, default="", help="Search query")
+
+    # Download Command
+    cmd_download = subparsers.add_parser("download", help="Download a specific comic by name")
+    cmd_download.add_argument("name", type=str, help="Exact name of the comic to download (case-insensitive, exact match on URL)")
+    cmd_download.add_argument("--output", type=str, default="downloads", help="Output directory to save the comic (default 'downloads')")
+    cmd_download.add_argument("--workers", type=int, default=5, help="Number of concurrent downloads (default 5)")
+
+    # Bulk Download Command
+    cmd_recent = subparsers.add_parser("download-recent", help="Download a batch of the most recent comics")
+    cmd_recent.add_argument("--count", type=int, default=5, help="Number of recent comics to download (default 5)")
+    cmd_recent.add_argument("--output", type=str, default="downloads", help="Output directory to save the comics")
+    cmd_recent.add_argument("--workers", type=int, default=5, help="Number of concurrent downloads (default 5)")
+
+    args = parser.parse_args()
+
+    if args.command == "browse":
+        print(f"--- Yiffer.xyz Browse (Page {args.page}) {'Search: ' + args.search if args.search else ''} ---")
+        comics, total = browse_comics(args.page, args.search)
+        total = int(total) if total else 0
+        if total > 0:
+            print(f"Total comics found: {total}")
+        print("-" * 60)
+        for i, c in enumerate(comics, 1):
+            print(f"{i:2d}. {c['name']} ({c['pages']} pages) [ID: {c['id']}]")
+        print("-" * 60)
+
+    elif args.command == "download":
+        download_comic(args.name, args.output, args.workers)
+
+    elif args.command == "download-recent":
+        # Always fetch page 1 to get the most recent ones
+        print(f"Fetching the {args.count} most recent comics...")
+        comics, _ = browse_comics(1, "")
+        if not comics:
+            print("Could not retrieve comics list.")
+            return
+
+        comics_to_download = comics[:args.count]
+        for idx, comic in enumerate(comics_to_download, 1):
+            print(f"\n--- [{idx}/{len(comics_to_download)}] Downloading: {comic['name']} ---")
+            download_comic(comic['name'], args.output, args.workers)
+
+    else:
+        parser.print_help()
+
+if __name__ == "__main__":
+    main()