Initial commit of Yiffer CLI scraper (Termux compatible) with CBZ support
This commit is contained in:
66
README.md
Normal file
66
README.md
Normal file
@@ -0,0 +1,66 @@
|
|||||||
|
# Yiffer Scraper

A pure Python scraper to download comics from yiffer.xyz, automatically packing them into `.cbz` files.

It is completely written using Python's standard library, meaning it works out-of-the-box anywhere Python 3 is installed—including on Android via Termux!

## Features

- **Browse & Search:** Search for comics directly from the terminal.
- **Concurrent Downloads:** Download pages in parallel.
- **CBZ Export:** Packages downloads directly into CBZ format.
- **Termux Compatible:** Requires zero third-party packages (no `pip install requests` necessary).
|
||||||
|
|
||||||
|
## Installation & Usage (All Platforms)

You only need Python 3 installed. If you're on Linux, macOS, or Windows, simply download `yiffer_scraper.py` and run it:

```bash
python3 yiffer_scraper.py browse
python3 yiffer_scraper.py download "Comic Name"
```
|
||||||
|
|
||||||
|
## Termux Setup (Android)

If you are using Termux on Android, setup is extremely simple:

1. Install Python:
   ```bash
   pkg update && pkg install python -y
   ```

2. Make the script executable:
   ```bash
   chmod +x yiffer_scraper.py
   ```

3. Run the script:
   ```bash
   ./yiffer_scraper.py --help
   ```
|
||||||
|
|
||||||
|
## Commands

### Browse

View the latest comics on the site:

```bash
./yiffer_scraper.py browse
./yiffer_scraper.py browse --page 2
```

### Search

Search for specific comics:

```bash
./yiffer_scraper.py browse --search "Bifurcation"
```

### Download

Download a specific comic by exact name:

```bash
./yiffer_scraper.py download "Bifurcation" --output ./my_comics
```

### Download Recent (Bulk)

Download the latest `N` comics from the homepage automatically:

```bash
./yiffer_scraper.py download-recent --count 10 --output ./my_comics
```
|
||||||
266
yiffer_scraper.py
Executable file
266
yiffer_scraper.py
Executable file
@@ -0,0 +1,266 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
import urllib.request
|
||||||
|
import urllib.parse
|
||||||
|
import re
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import tempfile
|
||||||
|
import zipfile
|
||||||
|
import argparse
|
||||||
|
import concurrent.futures
|
||||||
|
|
||||||
|
# Root of the yiffer.xyz site; HTML pages with the embedded turbo-stream data.
BASE_URL = "https://yiffer.xyz"
# CDN host that serves the actual comic page images.
PICS_BASE_URL = "https://pics.yiffer.xyz"
|
||||||
|
|
||||||
|
def fetch_html(url, timeout=30):
    """Fetch *url* and return the response body decoded as UTF-8.

    A browser-like User-Agent is sent because the site may reject the
    default urllib agent.

    Args:
        url: Absolute URL to fetch.
        timeout: Socket timeout in seconds (default 30) so a stalled
            connection cannot hang the scraper forever.

    Returns:
        The decoded page text, or ``None`` when the server answers 404.

    Raises:
        urllib.error.HTTPError: For any HTTP error other than 404.
        urllib.error.URLError: For network-level failures.
    """
    req = urllib.request.Request(
        url,
        headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'},
    )
    try:
        with urllib.request.urlopen(req, timeout=timeout) as response:
            return response.read().decode('utf-8')
    except urllib.error.HTTPError as e:
        # A missing comic is an expected condition, not a crash.
        if e.code == 404:
            return None
        raise
|
||||||
|
|
||||||
|
def parse_turbo_stream(html):
    """Extract the flattened Remix turbo-stream array embedded in *html*.

    The page inlines calls like ``streamController.enqueue("<json>");``
    where the argument is a JS string literal holding a JSON array.
    Each chunk is decoded twice (string literal first, then the array it
    contains) and all chunks are concatenated into one flat list.
    Malformed chunks are skipped silently (best-effort parsing).
    """
    if not html:
        return []

    chunk_pattern = r'streamController\.enqueue\((.*?)\);'
    result = []
    for raw_chunk in re.findall(chunk_pattern, html, re.DOTALL):
        try:
            # First loads decodes the JS string literal; the second
            # parses the JSON array that string contains.
            result.extend(json.loads(json.loads(raw_chunk)))
        except Exception:
            # Not a double-encoded JSON chunk — ignore it.
            continue
    return result
|
||||||
|
|
||||||
|
def browse_comics(page=1, search=None):
    """Browse or search comics from yiffer.xyz.

    Fetches one page of the /browse listing (optionally filtered by a
    search query), decodes the inlined turbo-stream and resolves its
    pointer-based records into plain dicts.

    Returns:
        A ``(comics, total)`` tuple where ``comics`` is a list of
        ``{"id", "name", "pages"}`` dicts and ``total`` is the site's
        reported comic count (0 when unavailable or on error).
    """
    listing_url = f"{BASE_URL}/browse?page={page}"
    if search:
        listing_url += f"&search={urllib.parse.quote(search)}"

    print(f"Fetching: {listing_url}")
    stream = parse_turbo_stream(fetch_html(listing_url))

    # The stream is one flat array; values reference each other by index.
    # The element after the string "comicsAndAds" is the list of pointers.
    try:
        pointer_list = stream[stream.index("comicsAndAds") + 1]
    except ValueError:
        print("Error: Could not find comics listing in the stream.")
        return [], 0

    # Object keys are encoded as "_<index of the key's name string>".
    try:
        key_id = f"_{stream.index('id')}"
        key_name = f"_{stream.index('name')}"
        key_pages = f"_{stream.index('numberOfPages')}"
    except ValueError:
        print("Error: Could not locate required keys in the stream.")
        return [], 0

    total = 0
    try:
        candidate = stream[stream.index("totalNumComics") + 1]
        if isinstance(candidate, int):
            total = candidate
    except ValueError:
        # The total count is optional; keep the default of 0.
        pass

    results = []
    for pointer in pointer_list:
        record = stream[pointer]
        if isinstance(record, dict) and key_id in record and key_name in record:
            results.append({
                "id": stream[record[key_id]],
                "name": stream[record[key_name]],
                "pages": stream[record[key_pages]] if key_pages in record else 0,
            })

    return results, total
|
||||||
|
|
||||||
|
def get_comic_data(comic_name):
    """Get metadata and page tokens for a specific comic.

    Resolves the comic's detail page, decodes its turbo-stream and pulls
    out the numeric ID, canonical name and the per-page image tokens.

    Returns:
        A ``{"id", "name", "pages"}`` dict where ``pages`` is a list of
        token strings, or ``None`` when the comic or a required field
        cannot be found.
    """
    detail_url = f"{BASE_URL}/c/{urllib.parse.quote(comic_name)}"
    html = fetch_html(detail_url)

    if not html:
        print(f"Comic '{comic_name}' not found.")
        return None

    stream = parse_turbo_stream(html)
    if not stream:
        print("Failed to parse comic stream.")
        return None

    # In the flat stream, the value for field X sits right after the
    # first occurrence of the string "X".
    try:
        comic_id = stream[stream.index("id") + 1]
    except ValueError:
        print("Could not find comic ID.")
        return None

    try:
        canonical_name = stream[stream.index("name") + 1]
    except ValueError:
        # Fall back to the name the caller supplied.
        canonical_name = comic_name

    try:
        page_pointers = stream[stream.index("pages") + 1]
    except ValueError:
        print("Could not find pages array.")
        return None

    # Page objects reference their token via the key "_<index of 'token'>".
    try:
        token_key = f"_{stream.index('token')}"
    except ValueError:
        print("Could not find token string in array.")
        return None

    tokens = [
        stream[stream[pointer][token_key]]
        for pointer in page_pointers
        if token_key in stream[pointer]
    ]

    return {
        "id": comic_id,
        "name": canonical_name,
        "pages": tokens,
    }
|
||||||
|
|
||||||
|
def download_image(url, output_path, timeout=60):
    """Download an image from the given URL and save it to output_path.

    Args:
        url: Direct URL of the image.
        output_path: Filesystem path the bytes are written to.
        timeout: Socket timeout in seconds (default 60) so one stuck
            page cannot stall a worker thread indefinitely.

    Returns:
        ``(True, "Success")`` on success, ``(False, <error message>)`` on
        any failure. The broad catch is deliberate: pool workers report
        status as data instead of raising across threads.
    """
    try:
        req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        with urllib.request.urlopen(req, timeout=timeout) as response:
            with open(output_path, 'wb') as f:
                f.write(response.read())
        return True, "Success"
    except Exception as e:
        return False, str(e)
|
||||||
|
|
||||||
|
def download_comic(comic_name, output_dir, max_workers=5):
    """Download all pages of a comic directly into a CBZ file.

    Pages are fetched concurrently into a temporary directory; only when
    every page succeeds is the CBZ (an uncompressed zip) written, so a
    partial download never leaves a corrupt archive behind.

    Args:
        comic_name: Exact comic name as used in the site URL.
        output_dir: Directory the .cbz is written to (created if needed).
        max_workers: Number of parallel download threads (default 5).
    """
    print(f"Fetching metadata for '{comic_name}'...")
    comic_data = get_comic_data(comic_name)

    if not comic_data:
        return

    c_id = comic_data['id']
    c_name = comic_data['name']
    tokens = comic_data['pages']
    num_pages = len(tokens)

    print(f"Found comic: {c_name} (ID: {c_id}) with {num_pages} pages.")

    # Strip characters that are illegal in filenames on Windows/Android.
    safe_name = re.sub(r'[\\/*?:"<>|]', "", c_name)
    os.makedirs(output_dir, exist_ok=True)
    cbz_filepath = os.path.join(output_dir, f"{safe_name}.cbz")

    if os.path.exists(cbz_filepath) and os.path.getsize(cbz_filepath) > 0:
        print(f"Skipping: {cbz_filepath} already exists.")
        return

    print(f"Downloading {num_pages} pages into temporary directory before zipping...")

    with tempfile.TemporaryDirectory() as tmp_dir:
        tasks = []
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            for i, token in enumerate(tokens, 1):
                url = f"{PICS_BASE_URL}/comics/{c_id}/{token}.jpg"
                filename = f"{i:03d}.jpg"
                filepath = os.path.join(tmp_dir, filename)

                tasks.append(
                    (i, num_pages, filename, executor.submit(download_image, url, filepath))
                )

            all_successful = True
            for i, num_pages, filename, future in tasks:
                success, msg = future.result()
                if success:
                    # BUGFIX: report the actual page file instead of the
                    # literal placeholder "(unknown)".
                    print(f"[{i:03d}/{num_pages:03d}] Downloaded: {filename}")
                else:
                    print(f"[{i:03d}/{num_pages:03d}] FAILED: {filename} - {msg}")
                    all_successful = False

        if all_successful:
            print(f"Packaging into {cbz_filepath}...")
            # ZIP_STORED (no compression): CBZ readers expect plain
            # storage and JPEGs do not compress further anyway.
            with zipfile.ZipFile(cbz_filepath, 'w', zipfile.ZIP_STORED) as cbz:
                for i in range(1, num_pages + 1):
                    filename = f"{i:03d}.jpg"
                    filepath = os.path.join(tmp_dir, filename)
                    cbz.write(filepath, arcname=filename)
            print(f"\nDownload complete! Saved to {cbz_filepath}")
        else:
            print("\nDownload incomplete due to failures. Skipping CBZ creation.")
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """CLI entry point: build the argument parser and dispatch commands."""
    parser = argparse.ArgumentParser(description="Scraper for Yiffer.xyz comics")
    subparsers = parser.add_subparsers(dest="command", help="Command to run")

    # Browse/Search Command
    browse_parser = subparsers.add_parser("browse", help="Browse or search for comics")
    browse_parser.add_argument("--page", type=int, default=1, help="Page number to view (default 1)")
    browse_parser.add_argument("--search", type=str, default="", help="Search query")

    # Download Command
    download_parser = subparsers.add_parser("download", help="Download a specific comic by name")
    download_parser.add_argument("name", type=str, help="Exact name of the comic to download (case-insensitive, exact match on URL)")
    download_parser.add_argument("--output", type=str, default="downloads", help="Output directory to save the comic (default 'downloads')")
    download_parser.add_argument("--workers", type=int, default=5, help="Number of concurrent downloads (default 5)")

    # Bulk Download Command
    recent_parser = subparsers.add_parser("download-recent", help="Download a batch of the most recent comics")
    recent_parser.add_argument("--count", type=int, default=5, help="Number of recent comics to download (default 5)")
    recent_parser.add_argument("--output", type=str, default="downloads", help="Output directory to save the comics")
    recent_parser.add_argument("--workers", type=int, default=5, help="Number of concurrent downloads (default 5)")

    args = parser.parse_args()

    if args.command == "browse":
        search_note = 'Search: ' + args.search if args.search else ''
        print(f"--- Yiffer.xyz Browse (Page {args.page}) {search_note} ---")
        comics, total = browse_comics(args.page, args.search)
        total = int(total) if total else 0
        if total > 0:
            print(f"Total comics found: {total}")
        print("-" * 60)
        for i, c in enumerate(comics, 1):
            print(f"{i:2d}. {c['name']} ({c['pages']} pages) [ID: {c['id']}]")
        print("-" * 60)

    elif args.command == "download":
        download_comic(args.name, args.output, args.workers)

    elif args.command == "download-recent":
        # Always fetch page 1 to get the most recent ones
        print(f"Fetching the {args.count} most recent comics...")
        recent, _ = browse_comics(1, "")
        if not recent:
            print("Could not retrieve comics list.")
            return

        batch = recent[:args.count]
        for idx, comic in enumerate(batch, 1):
            print(f"\n--- [{idx}/{len(batch)}] Downloading: {comic['name']} ---")
            download_comic(comic['name'], args.output, args.workers)

    else:
        parser.print_help()


if __name__ == "__main__":
    main()
|
||||||
Reference in New Issue
Block a user