Initial commit of Yiffer CLI scraper (Termux compatible) with CBZ support

This commit is contained in:
morpheus
2026-03-15 22:13:52 -03:00
commit 3048473c66
2 changed files with 332 additions and 0 deletions

66
README.md Normal file
View File

@@ -0,0 +1,66 @@
# Yiffer Scraper
A pure Python scraper to download comics from yiffer.xyz, automatically packing them into `.cbz` files.
It is completely written using Python's standard library, meaning it works out of the box anywhere Python 3 is installed—including on Android via Termux!
## Features
- **Browse & Search:** Search for comics directly from the terminal.
- **Concurrent Downloads:** Download pages in parallel.
- **CBZ Export:** Packages downloads directly into CBZ format.
- **Termux Compatible:** Requires zero third-party packages (no `pip install requests` necessary).
## Installation & Usage (All Platforms)
You only need Python 3 installed. If you're on Linux, macOS, or Windows, simply download `yiffer_scraper.py` and run it:
```bash
python3 yiffer_scraper.py browse
python3 yiffer_scraper.py download "Comic Name"
```
## Termux Setup (Android)
If you are using Termux on Android, setup is extremely simple:
1. Install Python:
```bash
pkg update && pkg install python -y
```
2. Make the script executable:
```bash
chmod +x yiffer_scraper.py
```
3. Run the script:
```bash
./yiffer_scraper.py --help
```
## Commands
### Browse
View the latest comics on the site:
```bash
./yiffer_scraper.py browse
./yiffer_scraper.py browse --page 2
```
### Search
Search for specific comics:
```bash
./yiffer_scraper.py browse --search "Bifurcation"
```
### Download
Download a specific comic by exact name:
```bash
./yiffer_scraper.py download "Bifurcation" --output ./my_comics
```
### Download Recent (Bulk)
Download the latest `N` comics from the homepage automatically:
```bash
./yiffer_scraper.py download-recent --count 10 --output ./my_comics
```

266
yiffer_scraper.py Executable file
View File

@@ -0,0 +1,266 @@
#!/usr/bin/env python3
import urllib.request
import urllib.parse
import re
import json
import os
import sys
import tempfile
import zipfile
import argparse
import concurrent.futures
# Site endpoints: HTML pages are served from BASE_URL; page images come from
# the separate pics CDN host.
BASE_URL = "https://yiffer.xyz"
PICS_BASE_URL = "https://pics.yiffer.xyz"
def fetch_html(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    Returns None when the server answers 404 (caller treats that as
    "not found"); any other HTTP error is re-raised.
    """
    req = urllib.request.Request(
        url,
        headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'},
    )
    try:
        # Explicit timeout so the scraper cannot hang forever on a stalled
        # connection (urlopen blocks indefinitely by default).
        with urllib.request.urlopen(req, timeout=30) as response:
            return response.read().decode('utf-8')
    except urllib.error.HTTPError as e:
        if e.code == 404:
            return None
        raise
def parse_turbo_stream(html):
    """Extract and flatten the Remix turbo-stream payload embedded in *html*.

    The page embeds its data as ``streamController.enqueue("<json>")`` calls
    whose argument is a JS string literal containing a JSON array. Each match
    is therefore decoded twice: once to unwrap the string literal, once to
    parse the array. All arrays are concatenated into one flat list.

    Returns [] for falsy input or when no chunk parses.
    """
    if not html:
        return []
    arr = []
    for chunk in re.findall(r'streamController\.enqueue\((.*?)\);', html, re.DOTALL):
        try:
            decoded = json.loads(chunk)      # 1st pass: the JS string literal
            arr.extend(json.loads(decoded))  # 2nd pass: the JSON array itself
        except (ValueError, TypeError):
            # Skip malformed or non-data enqueue calls (best-effort parsing),
            # but only swallow decode errors -- not arbitrary exceptions.
            continue
    return arr
def browse_comics(page=1, search=None):
    """Browse or search comics on yiffer.xyz.

    Returns ``(comics, total)`` where *comics* is a list of
    ``{"id", "name", "pages"}`` dicts and *total* is the site-reported comic
    count (0 when unavailable). On parse failure returns ``([], 0)``.
    """
    url = f"{BASE_URL}/browse?page={page}"
    if search:
        url += f"&search={urllib.parse.quote(search)}"
    print(f"Fetching: {url}")
    arr = parse_turbo_stream(fetch_html(url))
    # The stream is a flat array; a marker string is followed by its value.
    # Catch IndexError too: a truncated stream may end right after a marker.
    try:
        comics_arr = arr[arr.index("comicsAndAds") + 1]
    except (ValueError, IndexError):
        print("Error: Could not find comics listing in the stream.")
        return [], 0
    # Object fields reference values by index; the key for field "name" is
    # "_<index of the string 'name' in the flat array>".
    try:
        id_key = f"_{arr.index('id')}"
        name_key = f"_{arr.index('name')}"
        pages_key = f"_{arr.index('numberOfPages')}"
    except ValueError:
        print("Error: Could not locate required keys in the stream.")
        return [], 0
    total_comics = 0
    try:
        val = arr[arr.index("totalNumComics") + 1]
        if isinstance(val, int):
            total_comics = val
    except (ValueError, IndexError):
        pass  # total count is optional; keep 0
    comics = []
    for ptr in comics_arr:
        item_obj = arr[ptr]
        # Ad entries lack the comic keys and are filtered out here.
        if isinstance(item_obj, dict) and id_key in item_obj and name_key in item_obj:
            comics.append({
                "id": arr[item_obj[id_key]],
                "name": arr[item_obj[name_key]],
                "pages": arr[item_obj[pages_key]] if pages_key in item_obj else 0,
            })
    return comics, total_comics
def get_comic_data(comic_name):
    """Fetch metadata and page tokens for a specific comic.

    Returns ``{"id", "name", "pages"}`` where ``"pages"`` is the list of
    per-page image tokens, or None when the comic page is missing or the
    turbo-stream cannot be parsed.
    """
    url = f"{BASE_URL}/c/{urllib.parse.quote(comic_name)}"
    html = fetch_html(url)
    if not html:
        print(f"Comic '{comic_name}' not found.")
        return None
    arr = parse_turbo_stream(html)
    if not arr:
        print("Failed to parse comic stream.")
        return None
    # Marker strings are followed by their value in the flat stream array.
    try:
        comic_id = arr[arr.index("id") + 1]
    except ValueError:
        print("Could not find comic ID.")
        return None
    try:
        comic_name_real = arr[arr.index("name") + 1]
    except ValueError:
        comic_name_real = comic_name  # fall back to the requested name
    try:
        pages_arr = arr[arr.index("pages") + 1]
    except ValueError:
        print("Could not find pages array.")
        return None
    try:
        # Per-page objects store their token under the key "_<index of 'token'>".
        token_key = f"_{arr.index('token')}"
    except ValueError:
        print("Could not find token string in array.")
        return None
    tokens = []
    for ptr in pages_arr:
        page_obj = arr[ptr]
        # Guard: non-dict stream entries cannot hold a token (the original
        # `token_key in page_obj` would raise TypeError on e.g. an int).
        if isinstance(page_obj, dict) and token_key in page_obj:
            tokens.append(arr[page_obj[token_key]])
    return {
        "id": comic_id,
        "name": comic_name_real,
        "pages": tokens
    }
def download_image(url, output_path):
    """Download an image from *url* and save it to *output_path*.

    Returns ``(True, "Success")`` on success or ``(False, <error message>)``
    on any failure. Never raises, so it is safe to run inside worker threads
    whose exceptions would otherwise surface only via future.result().
    """
    try:
        req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        # Timeout so a single stalled page cannot hang the whole pool.
        with urllib.request.urlopen(req, timeout=60) as response:
            with open(output_path, 'wb') as f:
                f.write(response.read())
        return True, "Success"
    except Exception as e:
        # Deliberately broad: every failure mode is reported to the caller
        # as data rather than raised.
        return False, str(e)
def download_comic(comic_name, output_dir, max_workers=5):
    """Download every page of *comic_name* and package them as a CBZ file.

    Pages are fetched concurrently into a temporary directory; the .cbz is
    written only when all pages downloaded successfully. An existing
    non-empty CBZ for the same comic is skipped.
    """
    print(f"Fetching metadata for '{comic_name}'...")
    comic_data = get_comic_data(comic_name)
    if not comic_data:
        return
    c_id = comic_data['id']
    c_name = comic_data['name']
    tokens = comic_data['pages']
    num_pages = len(tokens)
    print(f"Found comic: {c_name} (ID: {c_id}) with {num_pages} pages.")
    # Strip characters that are illegal in filenames on common platforms.
    safe_name = re.sub(r'[\\/*?:"<>|]', "", c_name)
    os.makedirs(output_dir, exist_ok=True)
    cbz_filepath = os.path.join(output_dir, f"{safe_name}.cbz")
    if os.path.exists(cbz_filepath) and os.path.getsize(cbz_filepath) > 0:
        print(f"Skipping: {cbz_filepath} already exists.")
        return
    print(f"Downloading {num_pages} pages into temporary directory before zipping...")
    with tempfile.TemporaryDirectory() as tmp_dir:
        tasks = []
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            for i, token in enumerate(tokens, 1):
                url = f"{PICS_BASE_URL}/comics/{c_id}/{token}.jpg"
                filename = f"{i:03d}.jpg"
                filepath = os.path.join(tmp_dir, filename)
                tasks.append(
                    (i, filename, executor.submit(download_image, url, filepath))
                )
        # Exiting the executor context waited for every future to finish.
        all_successful = True
        for i, filename, future in tasks:
            success, msg = future.result()
            if success:
                # BUGFIX: report the actual page filename instead of the
                # literal "(unknown)" placeholder in the original prints.
                print(f"[{i:03d}/{num_pages:03d}] Downloaded: {filename}")
            else:
                print(f"[{i:03d}/{num_pages:03d}] FAILED: {filename} - {msg}")
                all_successful = False
        if all_successful:
            print(f"Packaging into {cbz_filepath}...")
            # ZIP_STORED: images are already compressed; don't recompress.
            with zipfile.ZipFile(cbz_filepath, 'w', zipfile.ZIP_STORED) as cbz:
                for i in range(1, num_pages + 1):
                    filename = f"{i:03d}.jpg"
                    cbz.write(os.path.join(tmp_dir, filename), arcname=filename)
            print(f"\nDownload complete! Saved to {cbz_filepath}")
        else:
            print("\nDownload incomplete due to failures. Skipping CBZ creation.")
def main():
    """CLI entry point: build the argument parser and dispatch the command."""
    parser = argparse.ArgumentParser(description="Scraper for Yiffer.xyz comics")
    sub = parser.add_subparsers(dest="command", help="Command to run")

    # browse / search
    p_browse = sub.add_parser("browse", help="Browse or search for comics")
    p_browse.add_argument("--page", type=int, default=1, help="Page number to view (default 1)")
    p_browse.add_argument("--search", type=str, default="", help="Search query")

    # single download
    p_dl = sub.add_parser("download", help="Download a specific comic by name")
    p_dl.add_argument("name", type=str, help="Exact name of the comic to download (case-insensitive, exact match on URL)")
    p_dl.add_argument("--output", type=str, default="downloads", help="Output directory to save the comic (default 'downloads')")
    p_dl.add_argument("--workers", type=int, default=5, help="Number of concurrent downloads (default 5)")

    # bulk download
    p_recent = sub.add_parser("download-recent", help="Download a batch of the most recent comics")
    p_recent.add_argument("--count", type=int, default=5, help="Number of recent comics to download (default 5)")
    p_recent.add_argument("--output", type=str, default="downloads", help="Output directory to save the comics")
    p_recent.add_argument("--workers", type=int, default=5, help="Number of concurrent downloads (default 5)")

    args = parser.parse_args()

    if args.command == "browse":
        print(f"--- Yiffer.xyz Browse (Page {args.page}) {'Search: ' + args.search if args.search else ''} ---")
        comics, total = browse_comics(args.page, args.search)
        total = int(total) if total else 0
        if total > 0:
            print(f"Total comics found: {total}")
        divider = "-" * 60
        print(divider)
        for idx, comic in enumerate(comics, 1):
            print(f"{idx:2d}. {comic['name']} ({comic['pages']} pages) [ID: {comic['id']}]")
        print(divider)
    elif args.command == "download":
        download_comic(args.name, args.output, args.workers)
    elif args.command == "download-recent":
        # Page 1 always lists the newest comics first.
        print(f"Fetching the {args.count} most recent comics...")
        comics, _ = browse_comics(1, "")
        if not comics:
            print("Could not retrieve comics list.")
            return
        batch = comics[:args.count]
        for n, comic in enumerate(batch, 1):
            print(f"\n--- [{n}/{len(batch)}] Downloading: {comic['name']} ---")
            download_comic(comic['name'], args.output, args.workers)
    else:
        parser.print_help()


if __name__ == "__main__":
    main()