Post-Local sync at 2025-06-23T22:46:07Z

2025-06-23 17:55:02 -05:00 · 2025-06-23 17:55:02 -05:00 · 9f97801b0d
commit 9f97801b0d
parent 9d33b42020
1387 changed files with 250216 additions and 117 deletions
--- a/.venv/lib/python3.12/site-packages/internetarchive/cli/ia_download.py
+++ b/.venv/lib/python3.12/site-packages/internetarchive/cli/ia_download.py
@ -0,0 +1,238 @@
+"""
+ia_download.py
+
+'ia' subcommand for downloading files from archive.org.
+"""
+
+# Copyright (C) 2012-2024 Internet Archive
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+from __future__ import annotations
+
+import argparse
+import sys
+from typing import TextIO
+
+from internetarchive.cli.cli_utils import (
+    QueryStringAction,
+    validate_dir_path,
+    validate_identifier,
+)
+from internetarchive.files import File
+from internetarchive.search import Search
+
+
+def setup(subparsers):
+    """
+    Setup args for download command.
+
+    Args:
+        subparsers: subparser object passed from ia.py
+    """
+    parser = subparsers.add_parser("download",
+                                   aliases=["do"],
+                                   help="Retrieve and modify archive.org item metadata")
+
+    # Main options
+    parser.add_argument("identifier",
+                        nargs="?",
+                        type=validate_identifier,
+                        help="Identifier for the upload")
+    parser.add_argument("file",
+                        nargs="*",
+                        help="Files to download")
+
+    # Additional options
+    parser.add_argument("-q", "--quiet",
+                        action="store_true",
+                        help="Turn off ia's output [default: False]")
+    parser.add_argument("-d", "--dry-run",
+                        action="store_true",
+                        help="Print URLs to stdout and exit")
+    parser.add_argument("-i", "--ignore-existing",
+                        action="store_true",
+                        help="Clobber files already downloaded")
+    parser.add_argument("-C", "--checksum",
+                        action="store_true",
+                        help="Skip files based on checksum [default: False]")
+    parser.add_argument("--checksum-archive",
+                        action="store_true",
+                        help="Skip files based on _checksum_archive.txt file")
+    parser.add_argument("-R", "--retries",
+                        type=int,
+                        default=5,
+                        help="Set number of retries to <retries> [default: 5]")
+    parser.add_argument("-I", "--itemlist",
+                        type=argparse.FileType("r"),
+                        help=("Download items from a specified file. "
+                             "Itemlists should be a plain text file with one "
+                             "identifier per line"))
+    parser.add_argument("-S", "--search",
+                        help="Download items returned from a specified search query")
+    parser.add_argument("-P", "--search-parameters",
+                        nargs="+",
+                        action=QueryStringAction,
+                        metavar="KEY:VALUE",
+                        help="Parameters to send with your --search query")
+    parser.add_argument("-g", "--glob",
+                        help=("Only download files whose filename matches "
+                             "the given glob pattern"))
+    parser.add_argument("-e", "--exclude",
+                        help=("Exclude files whose filename matches "
+                             "the given glob pattern"))
+    parser.add_argument("-f", "--format",
+                        nargs="+",
+                        help=("Only download files of the specified format. "
+                             "Use this option multiple times to download "
+                             "multiple formats. You can use the following command to "
+                             "retrieve a list of file formats contained within a "
+                             "given item: ia metadata --formats <identifier>"))
+    parser.add_argument("--on-the-fly",
+                        action="store_true",
+                        help=("Download on-the-fly files, as well as other "
+                             "matching files. on-the-fly files include derivative "
+                             "EPUB, MOBI and DAISY files [default: False]"))
+    parser.add_argument("--no-directories",
+                        action="store_true",
+                        help=("Download files into working directory. "
+                             "Do not create item directories"))
+    parser.add_argument("--destdir",
+                        type=validate_dir_path,
+                        help=("The destination directory to download files "
+                             "and item directories to"))
+    parser.add_argument("-s", "--stdout",
+                        action="store_true",
+                        help="Write file contents to stdout")
+    parser.add_argument("--no-change-timestamp",
+                        action="store_true",
+                        help=("Don't change the timestamp of downloaded files to reflect "
+                             "the source material"))
+    parser.add_argument("-p", "--parameters",
+                        nargs="+",
+                        action=QueryStringAction,
+                        metavar="KEY:VALUE",
+                        help="Parameters to send with your download request (e.g. `cnt=0`)")
+    parser.add_argument("-a", "--download-history",
+                        action="store_true",
+                        help="Also download files from the history directory")
+    parser.add_argument("--source",
+                        nargs="+",
+                        help=("Filter files based on their source value in files.xml "
+                             "(i.e. `original`, `derivative`, `metadata`)"))
+    parser.add_argument("--exclude-source",
+                        nargs="+",
+                        help=("Filter files based on their source value in files.xml "
+                             "(i.e. `original`, `derivative`, `metadata`)"))
+    parser.add_argument("-t", "--timeout",
+                        type=float,
+                        help=("Set a timeout for download requests. "
+                             "This sets both connect and read timeout"))
+
+    parser.set_defaults(func=lambda args: main(args, parser))
+
+
+def main(args: argparse.Namespace, parser: argparse.ArgumentParser) -> None:
+    """
+    Main entry point for 'ia download'.
+    """
+    ids: list[File | str] | Search | TextIO
+
+    if args.itemlist:
+        ids = [x.strip() for x in args.itemlist]
+        total_ids = len(ids)
+    elif args.search:
+        try:
+            _search = args.session.search_items(args.search,
+                                                params=args.search_parameters)
+            total_ids = _search.num_found
+            if total_ids == 0:
+                print(f"error: the query '{args.search}' returned no results", file=sys.stderr)
+                sys.exit(1)
+            ids = _search
+        except ValueError as e:
+            print(f"error: {e}", file=sys.stderr)
+            sys.exit(1)
+
+    # Download specific files.
+    if args.identifier and args.identifier != "-":
+        if "/" in args.identifier:
+            identifier = args.identifier.split("/")[0]
+            files = ["/".join(args.identifier.split("/")[1:])]
+        else:
+            identifier = args.identifier
+            files = args.file
+        total_ids = 1
+        ids = [identifier]
+    elif args.identifier == "-":
+        total_ids = 1
+        ids = sys.stdin
+        files = None
+    else:
+        files = None
+
+    errors = []
+    for i, identifier in enumerate(ids):
+        try:
+            identifier = identifier.strip()
+        except AttributeError:
+            identifier = identifier.get("identifier")
+        if total_ids > 1:
+            item_index = f"{i + 1}/{total_ids}"
+        else:
+            item_index = None
+
+        try:
+            item = args.session.get_item(identifier)
+        except Exception as exc:
+            print(f"{identifier}: failed to retrieve item metadata - errors", file=sys.stderr)
+            if "You are attempting to make an HTTPS" in str(exc):
+                print(f"\n{exc}", file=sys.stderr)
+                sys.exit(1)
+            else:
+                continue
+
+        # Otherwise, download the entire item.
+        ignore_history_dir = bool(args.download_history)
+        _errors = item.download(
+            files=files,
+            formats=args.format,
+            glob_pattern=args.glob,
+            exclude_pattern=args.exclude,
+            dry_run=args.dry_run,
+            verbose=not args.quiet,
+            ignore_existing=args.ignore_existing,
+            checksum=args.checksum,
+            checksum_archive=args.checksum_archive,
+            destdir=args.destdir,
+            no_directory=args.no_directories,
+            retries=args.retries,
+            item_index=item_index,
+            ignore_errors=True,
+            on_the_fly=args.on_the_fly,
+            no_change_timestamp=args.no_change_timestamp,
+            params=args.parameters,
+            ignore_history_dir=ignore_history_dir,
+            source=args.source,
+            exclude_source=args.exclude_source,
+            stdout=args.stdout,
+            timeout=args.timeout,
+        )
+        if _errors:
+            errors.append(_errors)
+    if errors:
+        # TODO: add option for a summary/report.
+        sys.exit(1)
+    else:
+        sys.exit(0)