Post-Local sync at 2025-06-23T22:46:07Z

2025-06-23 17:55:02 -05:00 · 2025-06-23 17:55:02 -05:00 · 9f97801b0d
commit 9f97801b0d
parent 9d33b42020
1387 changed files with 250216 additions and 117 deletions
--- a/.venv/lib/python3.12/site-packages/internetarchive/cli/ia_upload.py
+++ b/.venv/lib/python3.12/site-packages/internetarchive/cli/ia_upload.py
@ -0,0 +1,376 @@
+"""
+ia_upload.py
+
+'ia' subcommand for uploading files to archive.org.
+"""
+
+# Copyright (C) 2012-2024 Internet Archive
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+import argparse
+import csv
+import os
+import sys
+import webbrowser
+from copy import deepcopy
+from locale import getpreferredencoding
+from tempfile import TemporaryFile
+from typing import Union
+
+from requests.exceptions import HTTPError
+
+from internetarchive.cli.cli_utils import (
+    MetadataAction,
+    QueryStringAction,
+    get_args_dict,
+    validate_identifier,
+)
+from internetarchive.utils import (
+    InvalidIdentifierException,
+    JSONDecodeError,
+    is_valid_metadata_key,
+    json,
+)
+
+
+def setup(subparsers):
+    """
+    Register the ``upload`` subcommand (alias ``up``) and all of its arguments.
+
+    Args:
+        subparsers: subparser object passed from ia.py
+    """
+    parser = subparsers.add_parser("upload",
+                                   aliases=["up"],
+                                   help="Upload files to archive.org")
+
+    # (names, keyword arguments) pairs. They are registered strictly in this
+    # order so the generated usage/help text keeps its layout.
+    argument_specs = [
+        # Positional arguments
+        (("identifier",), {
+            "type": validate_identifier,
+            "nargs": "?",
+            "default": None,
+            "help": "Identifier for the upload",
+        }),
+        (("file",), {
+            "nargs": "*",
+            "type": validate_file,
+            "help": "File(s) to upload",
+        }),
+        # Options
+        (("-q", "--quiet"), {
+            "action": "store_true",
+            "help": "Turn off ia's output",
+        }),
+        (("-d", "--debug"), {
+            "action": "store_true",
+            "help": ("Print S3 request parameters to stdout and exit without "
+                     "sending request"),
+        }),
+        (("-r", "--remote-name"), {
+            "help": ("When uploading data from stdin, "
+                     "this option sets the remote filename"),
+        }),
+        (("-m", "--metadata"), {
+            "nargs": "+",
+            "action": MetadataAction,
+            "metavar": "KEY:VALUE",
+            "default": {},
+            "help": "Metadata to add to your item",
+        }),
+        (("--spreadsheet",), {
+            "type": argparse.FileType("r", encoding="utf-8-sig"),
+            "help": "Bulk uploading",
+        }),
+        (("--file-metadata",), {
+            "type": argparse.FileType("r"),
+            "help": "Upload files with file-level metadata via a file_md.jsonl file",
+        }),
+        (("-H", "--header"), {
+            "nargs": "+",
+            "action": QueryStringAction,
+            "default": {},
+            "help": "S3 HTTP headers to send with your request",
+        }),
+        (("-c", "--checksum"), {
+            "action": "store_true",
+            "help": "Skip based on checksum",
+        }),
+        (("-v", "--verify"), {
+            "action": "store_true",
+            "help": "Verify that data was not corrupted traversing the network",
+        }),
+        (("-n", "--no-derive"), {
+            "action": "store_true",
+            "help": "Do not derive uploaded files",
+        }),
+        (("--size-hint",), {
+            "help": "Specify a size-hint for your item",
+        }),
+        (("--delete",), {
+            "action": "store_true",
+            "help": "Delete files after verifying checksums",
+        }),
+        (("-R", "--retries"), {
+            "type": int,
+            "help": "Number of times to retry request if S3 returns a 503 SlowDown error",
+        }),
+        (("-s", "--sleep"), {
+            "type": int,
+            "help": "The amount of time to sleep between retries",
+        }),
+        (("--no-collection-check",), {
+            "action": "store_true",
+            "help": "Skip collection exists check",
+        }),
+        (("-o", "--open-after-upload"), {
+            "action": "store_true",
+            "help": "Open the details page for an item after upload",
+        }),
+        (("--no-backup",), {
+            "action": "store_true",
+            "help": "Turn off archive.org backups",
+        }),
+        (("--keep-directories",), {
+            "action": "store_true",
+            "help": "Keep directories in the supplied file paths for the remote filename",
+        }),
+        (("--no-scanner",), {
+            "action": "store_true",
+            "help": "Do not set the scanner field in meta.xml",
+        }),
+        (("--status-check",), {
+            "action": "store_true",
+            "help": "Check if S3 is accepting requests to the given item",
+        }),
+    ]
+    for names, options in argument_specs:
+        parser.add_argument(*names, **options)
+
+    # Bind the subparser so main() can report usage errors through it.
+    parser.set_defaults(func=lambda args: main(args, parser))
+
+
+def _upload_files(item, files, upload_kwargs, prev_identifier=None):
+    """
+    Upload ``files`` through :meth:`Item.upload` and collect the responses.
+
+    Args:
+        item: Object exposing ``identifier`` and ``upload(files, **kwargs)``.
+        files: The file(s) to upload; must not be empty.
+        upload_kwargs: Keyword arguments forwarded to ``item.upload``. Its
+            ``"verbose"`` and ``"debug"`` entries are also read here.
+        prev_identifier: Identifier handled by the previous call; when it
+            equals ``item.identifier`` the identifier header is not printed
+            again.
+
+    Returns:
+        list: Whatever ``item.upload`` returned, or the response attached to
+        the ``HTTPError`` if one was raised.
+
+    Raises:
+        FileNotFoundError: If ``files`` is empty.
+        SystemExit: With status 1 when ``InvalidIdentifierException`` is raised.
+    """
+    # Nothing to upload means every supplied path was rejected.
+    if not files:
+        raise FileNotFoundError("No valid file was found. Check your paths.")
+
+    if upload_kwargs["verbose"] and prev_identifier != item.identifier:
+        print(f"{item.identifier}:", file=sys.stderr)
+
+    collected = []
+    try:
+        collected.extend(item.upload(files, **upload_kwargs))
+    except HTTPError as exc:
+        # Keep the failed response so the caller can inspect its status.
+        collected.append(exc.response)
+    except InvalidIdentifierException as exc:
+        print(str(exc), file=sys.stderr)
+        sys.exit(1)
+    finally:
+        # Debug mode: dump the endpoint and headers of each collected response.
+        if upload_kwargs["debug"]:
+            for index, resp in enumerate(collected):
+                if index:
+                    print("---", file=sys.stderr)
+                header_text = "\n".join(
+                    f" {key}:{value}" for key, value in resp.headers.items()
+                )
+                print(f"Endpoint:\n {resp.url}\n", file=sys.stderr)
+                print(f"HTTP Headers:\n{header_text}", file=sys.stderr)
+
+    return collected
+
+
+def uploading_from_stdin(args):
+    """
+    Tell whether the only file argument is ``-``, i.e. the data comes from stdin.
+
+    Returns:
+        bool: True only when ``args.file`` holds exactly one entry and that
+        entry is ``"-"``; False otherwise, including when it is empty or None.
+    """
+    file_args = args.file
+    return bool(file_args) and len(file_args) == 1 and file_args[0] == "-"
+
+
+def check_if_file_arg_required(args, parser):
+    """
+    Report a usage error when no file was given and no option replaces it.
+
+    A file argument may be omitted only if one of ``--spreadsheet``,
+    ``--file-metadata`` or ``--status-check`` was supplied.
+
+    Args:
+        args: Parsed arguments for the upload command.
+        parser: Parser whose ``error`` method is used to report the problem.
+    """
+    substitutes = (args.spreadsheet, args.file_metadata, args.status_check)
+    if args.file or any(substitutes):
+        return
+    parser.error("You must specify a file to upload.")
+
+
+def validate_file(arg):
+    """
+    argparse ``type`` callback for the ``file`` positional argument.
+
+    Args:
+        arg: A path given on the command line, or ``-`` for stdin.
+
+    Returns:
+        The argument unchanged when it is ``-`` or an existing path.
+
+    Raises:
+        argparse.ArgumentTypeError: If ``arg`` is neither ``-`` nor an
+            existing file or directory.
+    """
+    if arg != "-" and not os.path.exists(arg):
+        raise argparse.ArgumentTypeError(f"'{arg}' is not a valid file or directory")
+    return arg
+
+
+def _load_file_metadata(source):
+    """
+    Load the file list supplied through ``--file-metadata``.
+
+    Args:
+        source: An already-open text file object (which is what
+            ``argparse.FileType`` hands over) or a path to open.
+
+    Returns:
+        The parsed document when the whole content is valid JSON; otherwise a
+        list with one parsed object per non-blank line (JSON Lines).
+    """
+    if hasattr(source, "read"):
+        # argparse has already opened the file; calling open() on the handle
+        # would raise TypeError, so read from it directly.
+        try:
+            content = source.read()
+        finally:
+            if source is not sys.stdin:
+                source.close()
+    else:
+        with open(source) as fh:
+            content = fh.read()
+
+    try:
+        return json.loads(content)
+    except JSONDecodeError:
+        # Not a single JSON document: treat it as JSON Lines.
+        return [json.loads(line) for line in content.splitlines() if line.strip()]
+
+
+def main(args, parser): # noqa: PLR0912,C901
+    """
+    Main entry point for 'ia upload'.
+
+    Args:
+        args: Parsed arguments for the upload command. ``args.session`` must
+            also be set; it is used to fetch items and to query S3 status.
+        parser: The upload subparser, used to report usage errors.
+
+    Exits with status 1 if any upload response indicates failure. With
+    ``--status-check`` it exits with 1 when S3 is overloaded and 0 otherwise,
+    without uploading anything.
+    """
+    # TODO: Refactor to deal with PLR0912 and C901
+    # add type hints
+    check_if_file_arg_required(args, parser)
+
+    if uploading_from_stdin(args) and not args.remote_name:
+        parser.error("When uploading from stdin, "
+                     "you must specify a remote filename with --remote-name")
+
+    item = None
+    if args.status_check:  # TODO: support for checking if a specific bucket is overloaded
+        if args.session.s3_is_overloaded():
+            print(f"warning: {args.identifier} is over limit, and not accepting requests. "
+                  "Expect 503 SlowDown errors.",
+                  file=sys.stderr)
+            sys.exit(1)
+        else:
+            print(f"success: {args.identifier} is accepting requests.", file=sys.stderr)
+            sys.exit(0)
+    elif args.identifier:
+        item = args.session.get_item(args.identifier)
+    elif not args.spreadsheet:
+        # Without a spreadsheet there is no other source for the identifier;
+        # fail with a usage error instead of an UnboundLocalError later on.
+        parser.error("You must specify an identifier to upload to.")
+
+    # Prepare upload headers and kwargs
+    queue_derive = not args.no_derive
+    verbose = not args.quiet
+    set_scanner = not args.no_scanner
+    if args.size_hint:
+        args.header["x-archive-size-hint"] = args.size_hint
+    if not args.header.get("x-archive-keep-old-version") \
+            and not args.no_backup:
+        args.header["x-archive-keep-old-version"] = "1"
+
+    if args.file_metadata:
+        # The entries of the metadata file are the files to upload.
+        args.file = _load_file_metadata(args.file_metadata)
+
+    upload_kwargs = {
+        "metadata": args.metadata,
+        "headers": args.header,
+        "debug": args.debug,
+        "queue_derive": queue_derive,
+        "set_scanner": set_scanner,
+        "verbose": verbose,
+        "verify": args.verify,
+        "checksum": args.checksum,
+        "retries": args.retries,
+        "retries_sleep": args.sleep,
+        "delete": args.delete,
+        "validate_identifier": True,
+    }
+
+    # Upload files
+    errors = False
+    if not args.spreadsheet:
+        if uploading_from_stdin(args):
+            local_file = TemporaryFile()
+            # sys.stdin normally has the buffer attribute which returns bytes.
+            # However, this might not always be the case, e.g. on mocking for test purposes.
+            # Fall back to reading as str and encoding back to bytes.
+            # Note that the encoding attribute might also be None. In that case, fall back to
+            # locale.getpreferredencoding, the default of io.TextIOWrapper and open().
+            if hasattr(sys.stdin, "buffer"):
+                def read():
+                    return sys.stdin.buffer.read(1048576)
+            else:
+                encoding = sys.stdin.encoding or getpreferredencoding(False)
+
+                def read():
+                    return sys.stdin.read(1048576).encode(encoding)
+            # Spool stdin to the temporary file in 1 MiB chunks.
+            while True:
+                data = read()
+                if not data:
+                    break
+                local_file.write(data)
+            local_file.seek(0)
+        else:
+            local_file = args.file
+            # Properly expand a period to the contents of the current working directory.
+            # args.file is a list (nargs="*"), so look for a "." entry inside it.
+            if isinstance(local_file, (list, tuple, set)) and "." in local_file:
+                local_file = [p for p in local_file if p != "."]
+                local_file = os.listdir(".") + local_file
+
+        # A remote name applies to a single file: use the first one given.
+        if isinstance(local_file, (list, tuple, set)) and args.remote_name:
+            local_file = local_file[0]
+        if args.remote_name:
+            files = {args.remote_name: local_file}
+        elif args.keep_directories:
+            files = {f: f for f in local_file}
+        else:
+            files = local_file
+
+        for _r in _upload_files(item, files, upload_kwargs):
+            if args.debug:
+                break
+
+            # Check if Response is empty first (i.e. --checksum)
+            # TODO: Should upload return something other than an empty Response
+            # object if checksum is set and the file is already in the item?
+            if _r.status_code is None:
+                pass
+            elif not _r.ok:
+                errors = True
+            else:
+                if args.open_after_upload:
+                    url = f"{args.session.protocol}//{args.session.host}/details/{item.identifier}"
+                    webbrowser.open_new_tab(url)
+
+    # Bulk upload using spreadsheet.
+    else:
+        # Use the same session for each upload request.
+        with args.spreadsheet as csvfp:
+            spreadsheet = csv.DictReader(csvfp)
+            prev_identifier = None
+            for row in spreadsheet:
+                for metadata_key in row:
+                    if not is_valid_metadata_key(metadata_key):
+                        print(f"error: '{metadata_key}' is not a valid metadata key.",
+                              file=sys.stderr)
+                        sys.exit(1)
+                # Copy so per-row metadata does not leak into the following rows.
+                upload_kwargs_copy = deepcopy(upload_kwargs)
+                if row.get("REMOTE_NAME"):
+                    local_file = {row["REMOTE_NAME"]: row["file"]}
+                    del row["REMOTE_NAME"]
+                elif args.keep_directories:
+                    local_file = {row["file"]: row["file"]}
+                else:
+                    local_file = row["file"]
+                identifier = row.get("item", row.get("identifier"))
+                if not identifier:
+                    # Rows without an identifier reuse the previous row's one.
+                    if not prev_identifier:
+                        print("error: no identifier column on spreadsheet.",
+                              file=sys.stderr)
+                        sys.exit(1)
+                    identifier = prev_identifier
+                # Drop the bookkeeping columns; what is left is item metadata.
+                del row["file"]
+                if "identifier" in row:
+                    del row["identifier"]
+                if "item" in row:
+                    del row["item"]
+                item = args.session.get_item(identifier)
+                # TODO: Clean up how indexed metadata items are coerced
+                # into metadata.
+                md_args = [f"{k.lower()}:{v}" for (k, v) in row.items() if v]
+                metadata = get_args_dict(md_args)
+                upload_kwargs_copy["metadata"].update(metadata)
+                r = _upload_files(item, local_file, upload_kwargs_copy, prev_identifier)
+                for _r in r:
+                    if args.debug:
+                        break
+                    # NOTE(review): unlike the single-item branch above, an empty
+                    # response (status_code is None, e.g. skipped by --checksum) is
+                    # counted as an error here -- confirm this is intended.
+                    if (not _r.status_code) or (not _r.ok):
+                        errors = True
+                    else:
+                        if args.open_after_upload:
+                            # Both pieces must be f-strings, otherwise the literal
+                            # text "{identifier}" ends up in the URL.
+                            url = (f"{args.session.protocol}//{args.session.host}"
+                                   f"/details/{identifier}")
+                            webbrowser.open_new_tab(url)
+                prev_identifier = identifier
+
+    if errors:
+        sys.exit(1)