""" ia_download.py 'ia' subcommand for downloading files from archive.org. """ # Copyright (C) 2012-2024 Internet Archive # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as # published by the Free Software Foundation, either version 3 of the # License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Affero General Public License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . from __future__ import annotations import argparse import sys from typing import TextIO from internetarchive.cli.cli_utils import ( QueryStringAction, validate_dir_path, validate_identifier, ) from internetarchive.files import File from internetarchive.search import Search def setup(subparsers): """ Setup args for download command. Args: subparsers: subparser object passed from ia.py """ parser = subparsers.add_parser("download", aliases=["do"], help="Retrieve and modify archive.org item metadata") # Main options parser.add_argument("identifier", nargs="?", type=validate_identifier, help="Identifier for the upload") parser.add_argument("file", nargs="*", help="Files to download") # Additional options parser.add_argument("-q", "--quiet", action="store_true", help="Turn off ia's output [default: False]") parser.add_argument("-d", "--dry-run", action="store_true", help="Print URLs to stdout and exit") parser.add_argument("-i", "--ignore-existing", action="store_true", help="Clobber files already downloaded") parser.add_argument("-C", "--checksum", action="store_true", help="Skip files based on checksum [default: False]") parser.add_argument("--checksum-archive", action="store_true", help="Skip files based on _checksum_archive.txt file") parser.add_argument("-R", "--retries", type=int, default=5, help="Set number of retries to [default: 5]") parser.add_argument("-I", "--itemlist", type=argparse.FileType("r"), help=("Download items from a specified file. " "Itemlists should be a plain text file with one " "identifier per line")) parser.add_argument("-S", "--search", help="Download items returned from a specified search query") parser.add_argument("-P", "--search-parameters", nargs="+", action=QueryStringAction, metavar="KEY:VALUE", help="Parameters to send with your --search query") parser.add_argument("-g", "--glob", help=("Only download files whose filename matches " "the given glob pattern")) parser.add_argument("-e", "--exclude", help=("Exclude files whose filename matches " "the given glob pattern")) parser.add_argument("-f", "--format", nargs="+", help=("Only download files of the specified format. " "Use this option multiple times to download " "multiple formats. You can use the following command to " "retrieve a list of file formats contained within a " "given item: ia metadata --formats ")) parser.add_argument("--on-the-fly", action="store_true", help=("Download on-the-fly files, as well as other " "matching files. on-the-fly files include derivative " "EPUB, MOBI and DAISY files [default: False]")) parser.add_argument("--no-directories", action="store_true", help=("Download files into working directory. " "Do not create item directories")) parser.add_argument("--destdir", type=validate_dir_path, help=("The destination directory to download files " "and item directories to")) parser.add_argument("-s", "--stdout", action="store_true", help="Write file contents to stdout") parser.add_argument("--no-change-timestamp", action="store_true", help=("Don't change the timestamp of downloaded files to reflect " "the source material")) parser.add_argument("-p", "--parameters", nargs="+", action=QueryStringAction, metavar="KEY:VALUE", help="Parameters to send with your download request (e.g. `cnt=0`)") parser.add_argument("-a", "--download-history", action="store_true", help="Also download files from the history directory") parser.add_argument("--source", nargs="+", help=("Filter files based on their source value in files.xml " "(i.e. `original`, `derivative`, `metadata`)")) parser.add_argument("--exclude-source", nargs="+", help=("Filter files based on their source value in files.xml " "(i.e. `original`, `derivative`, `metadata`)")) parser.add_argument("-t", "--timeout", type=float, help=("Set a timeout for download requests. " "This sets both connect and read timeout")) parser.set_defaults(func=lambda args: main(args, parser)) def main(args: argparse.Namespace, parser: argparse.ArgumentParser) -> None: """ Main entry point for 'ia download'. """ ids: list[File | str] | Search | TextIO if args.itemlist: ids = [x.strip() for x in args.itemlist] total_ids = len(ids) elif args.search: try: _search = args.session.search_items(args.search, params=args.search_parameters) total_ids = _search.num_found if total_ids == 0: print(f"error: the query '{args.search}' returned no results", file=sys.stderr) sys.exit(1) ids = _search except ValueError as e: print(f"error: {e}", file=sys.stderr) sys.exit(1) # Download specific files. if args.identifier and args.identifier != "-": if "/" in args.identifier: identifier = args.identifier.split("/")[0] files = ["/".join(args.identifier.split("/")[1:])] else: identifier = args.identifier files = args.file total_ids = 1 ids = [identifier] elif args.identifier == "-": total_ids = 1 ids = sys.stdin files = None else: files = None errors = [] for i, identifier in enumerate(ids): try: identifier = identifier.strip() except AttributeError: identifier = identifier.get("identifier") if total_ids > 1: item_index = f"{i + 1}/{total_ids}" else: item_index = None try: item = args.session.get_item(identifier) except Exception as exc: print(f"{identifier}: failed to retrieve item metadata - errors", file=sys.stderr) if "You are attempting to make an HTTPS" in str(exc): print(f"\n{exc}", file=sys.stderr) sys.exit(1) else: continue # Otherwise, download the entire item. ignore_history_dir = bool(args.download_history) _errors = item.download( files=files, formats=args.format, glob_pattern=args.glob, exclude_pattern=args.exclude, dry_run=args.dry_run, verbose=not args.quiet, ignore_existing=args.ignore_existing, checksum=args.checksum, checksum_archive=args.checksum_archive, destdir=args.destdir, no_directory=args.no_directories, retries=args.retries, item_index=item_index, ignore_errors=True, on_the_fly=args.on_the_fly, no_change_timestamp=args.no_change_timestamp, params=args.parameters, ignore_history_dir=ignore_history_dir, source=args.source, exclude_source=args.exclude_source, stdout=args.stdout, timeout=args.timeout, ) if _errors: errors.append(_errors) if errors: # TODO: add option for a summary/report. sys.exit(1) else: sys.exit(0)