Post-Local sync at 2025-06-23T22:46:07Z
parent 9d33b42020, commit 9f97801b0d
1387 changed files with 250216 additions and 117 deletions
@@ -0,0 +1,376 @@
"""
|
||||
ia_upload.py
|
||||
|
||||
'ia' subcommand for uploading files to archive.org.
|
||||
"""
|
||||
|
||||
# Copyright (C) 2012-2024 Internet Archive
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU Affero General Public License as
|
||||
# published by the Free Software Foundation, either version 3 of the
|
||||
# License, or (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU Affero General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Affero General Public License
|
||||
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
import argparse
|
||||
import csv
|
||||
import os
|
||||
import sys
|
||||
import webbrowser
|
||||
from copy import deepcopy
|
||||
from locale import getpreferredencoding
|
||||
from tempfile import TemporaryFile
|
||||
from typing import Union
|
||||
|
||||
from requests.exceptions import HTTPError
|
||||
|
||||
from internetarchive.cli.cli_utils import (
|
||||
MetadataAction,
|
||||
QueryStringAction,
|
||||
get_args_dict,
|
||||
validate_identifier,
|
||||
)
|
||||
from internetarchive.utils import (
|
||||
InvalidIdentifierException,
|
||||
JSONDecodeError,
|
||||
is_valid_metadata_key,
|
||||
json,
|
||||
)
|
||||
|
||||
|
||||
def setup(subparsers):
    """
    Setup args for upload command.

    Args:
        subparsers: subparser object passed from ia.py
    """
    parser = subparsers.add_parser("upload",
                                   aliases=["up"],
                                   help="Upload files to archive.org")

    # Positional arguments
    parser.add_argument("identifier",
                        type=validate_identifier,
                        nargs="?",
                        default=None,
                        help="Identifier for the upload")
    parser.add_argument("file",
                        nargs="*",
                        type=validate_file,
                        help="File(s) to upload")

    # Options
    parser.add_argument("-q", "--quiet",
                        action="store_true",
                        help="Turn off ia's output")
    parser.add_argument("-d", "--debug",
                        action="store_true",
                        help=("Print S3 request parameters to stdout and exit without "
                              "sending request"))
    parser.add_argument("-r", "--remote-name",
                        help=("When uploading data from stdin, "
                              "this option sets the remote filename"))
    parser.add_argument("-m", "--metadata",
                        nargs="+",
                        action=MetadataAction,
                        metavar="KEY:VALUE",
                        default={},
                        help="Metadata to add to your item")
    parser.add_argument("--spreadsheet",
                        type=argparse.FileType("r", encoding="utf-8-sig"),
                        help="Bulk uploading")
    parser.add_argument("--file-metadata",
                        type=argparse.FileType("r"),
                        help="Upload files with file-level metadata via a file_md.jsonl file")
    parser.add_argument("-H", "--header",
                        nargs="+",
                        action=QueryStringAction,
                        default={},
                        help="S3 HTTP headers to send with your request")
    parser.add_argument("-c", "--checksum",
                        action="store_true",
                        help="Skip based on checksum")
    parser.add_argument("-v", "--verify",
                        action="store_true",
                        help="Verify that data was not corrupted traversing the network")
    parser.add_argument("-n", "--no-derive",
                        action="store_true",
                        help="Do not derive uploaded files")
    parser.add_argument("--size-hint",
                        help="Specify a size-hint for your item")
    parser.add_argument("--delete",
                        action="store_true",
                        help="Delete files after verifying checksums")
    parser.add_argument("-R", "--retries",
                        type=int,
                        help="Number of times to retry request if S3 returns a 503 SlowDown error")
    parser.add_argument("-s", "--sleep",
                        type=int,
                        help="The amount of time to sleep between retries")
    parser.add_argument("--no-collection-check",
                        action="store_true",
                        help="Skip collection exists check")
    parser.add_argument("-o", "--open-after-upload",
                        action="store_true",
                        help="Open the details page for an item after upload")
    parser.add_argument("--no-backup",
                        action="store_true",
                        help="Turn off archive.org backups")
    parser.add_argument("--keep-directories",
                        action="store_true",
                        help="Keep directories in the supplied file paths for the remote filename")
    parser.add_argument("--no-scanner",
                        action="store_true",
                        help="Do not set the scanner field in meta.xml")
    parser.add_argument("--status-check",
                        action="store_true",
                        help="Check if S3 is accepting requests to the given item")

    parser.set_defaults(func=lambda args: main(args, parser))
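
# Typical invocations, for orientation (the identifier and file names below
# are hypothetical examples, not defaults of this module):
#
#   ia upload my-test-item photo.jpg --metadata "title:My Photo"
#   cat data.tar.gz | ia upload my-test-item - --remote-name data.tar.gz
#   ia upload --spreadsheet metadata.csv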


def _upload_files(item, files, upload_kwargs, prev_identifier=None):
    """
    Helper function for calling :meth:`Item.upload`
    """
    # Check if the list has any element.
    if not files:
        raise FileNotFoundError("No valid file was found. Check your paths.")

    responses = []
    if upload_kwargs["verbose"] and prev_identifier != item.identifier:
        print(f"{item.identifier}:", file=sys.stderr)

    try:
        response = item.upload(files, **upload_kwargs)
        responses += response
    except HTTPError as exc:
        responses += [exc.response]
    except InvalidIdentifierException as exc:
        print(str(exc), file=sys.stderr)
        sys.exit(1)
    finally:
        # Debug mode.
        if upload_kwargs["debug"]:
            for i, r in enumerate(responses):
                if i != 0:
                    print("---", file=sys.stderr)
                headers = "\n".join(
                    [f" {k}:{v}" for (k, v) in r.headers.items()]
                )
                print(f"Endpoint:\n {r.url}\n", file=sys.stderr)
                print(f"HTTP Headers:\n{headers}", file=sys.stderr)

    return responses


def uploading_from_stdin(args):
    """
    Check if the user is uploading from stdin.
    """
    if not args.file:
        return False
    if len(args.file) == 1 and args.file[0] == "-":
        return True
    return False


def check_if_file_arg_required(args, parser):
    required_if_no_file = [args.spreadsheet, args.file_metadata, args.status_check]
    if not args.file and not any(required_if_no_file):
        parser.error("You must specify a file to upload.")


def validate_file(arg):
    if os.path.exists(arg) or arg == "-":
        return arg
    raise argparse.ArgumentTypeError(f"'{arg}' is not a valid file or directory")


def main(args, parser):  # noqa: PLR0912,C901
    # TODO: Refactor to deal with PLR0912 and C901
    # TODO: add type hints
    """
    Main entry point for 'ia upload'.
    """
    check_if_file_arg_required(args, parser)

    if uploading_from_stdin(args) and not args.remote_name:
        parser.error("When uploading from stdin, "
                     "you must specify a remote filename with --remote-name")

    if args.status_check:  # TODO: support for checking if a specific bucket is overloaded
        if args.session.s3_is_overloaded():
            print(f"warning: {args.identifier} is over limit, and not accepting requests. "
                  "Expect 503 SlowDown errors.",
                  file=sys.stderr)
            sys.exit(1)
        else:
            print(f"success: {args.identifier} is accepting requests.", file=sys.stderr)
            sys.exit(0)
    elif args.identifier:
        item = args.session.get_item(args.identifier)

    # Prepare upload headers and kwargs
    queue_derive = not args.no_derive
    verbose = not args.quiet
    set_scanner = not args.no_scanner
    if args.size_hint:
        args.header["x-archive-size-hint"] = args.size_hint
    if not args.header.get("x-archive-keep-old-version") and not args.no_backup:
        args.header["x-archive-keep-old-version"] = "1"

    if args.file_metadata:
        # --file-metadata is already an open file handle (argparse.FileType),
        # and the parsed result becomes the list of files to upload.
        try:
            args.file = json.load(args.file_metadata)
        except JSONDecodeError:
            # Not a single JSON document; re-read it as JSON Lines,
            # one JSON object per line.
            args.file_metadata.seek(0)
            args.file = []
            for line in args.file_metadata:
                j = json.loads(line.strip())
                args.file.append(j)
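
    # For orientation, a sketch of one JSON Lines entry: the "name" key names
    # the local file in a file dict, and the remaining keys shown here are
    # hypothetical file-level metadata:
    #
    #   {"name": "photo.jpg", "title": "My Photo"}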

    upload_kwargs = {
        "metadata": args.metadata,
        "headers": args.header,
        "debug": args.debug,
        "queue_derive": queue_derive,
        "set_scanner": set_scanner,
        "verbose": verbose,
        "verify": args.verify,
        "checksum": args.checksum,
        "retries": args.retries,
        "retries_sleep": args.sleep,
        "delete": args.delete,
        "validate_identifier": True,
    }
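    # Everything in upload_kwargs is forwarded to Item.upload() by
    # _upload_files() above; "verbose" and "debug" are also read there.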

    # Upload files
    errors = False
    if not args.spreadsheet:
        if uploading_from_stdin(args):
            local_file = TemporaryFile()
            # sys.stdin normally has the buffer attribute which returns bytes.
            # However, this might not always be the case, e.g. on mocking for test purposes.
            # Fall back to reading as str and encoding back to bytes.
            # Note that the encoding attribute might also be None. In that case, fall back to
            # locale.getpreferredencoding, the default of io.TextIOWrapper and open().
            if hasattr(sys.stdin, "buffer"):
                def read():
                    return sys.stdin.buffer.read(1048576)
            else:
                encoding = sys.stdin.encoding or getpreferredencoding(False)

                def read():
                    return sys.stdin.read(1048576).encode(encoding)
            while True:
                data = read()
                if not data:
                    break
                local_file.write(data)
            local_file.seek(0)
        else:
            local_file = args.file
            # Properly expand a period to the contents of the current working directory.
            if isinstance(local_file, list) and "." in local_file:
                local_file = [p for p in local_file if p != "."]
                local_file = os.listdir(".") + local_file

        if isinstance(local_file, (list, tuple, set)) and args.remote_name:
            local_file = local_file[0]
        if args.remote_name:
            files = {args.remote_name: local_file}
        elif args.keep_directories:
            files = {f: f for f in local_file}
        else:
            files = local_file

        for _r in _upload_files(item, files, upload_kwargs):
            if args.debug:
                break

            # Check if Response is empty first (i.e. --checksum)
            # TODO: Should upload return something other than an empty Response
            # object if checksum is set and the file is already in the item?
            if _r.status_code is None:
                pass
            elif not _r.ok:
                errors = True
            else:
                if args.open_after_upload:
                    url = f"{args.session.protocol}//{args.session.host}/details/{item.identifier}"
                    webbrowser.open_new_tab(url)

    # Bulk upload using spreadsheet.
    else:
        # Use the same session for each upload request.
        with args.spreadsheet as csvfp:
            spreadsheet = csv.DictReader(csvfp)
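            # A sketch of the expected layout: an "identifier" (or "item")
            # column, a "file" column, an optional "REMOTE_NAME" column, and
            # any other columns are treated as item metadata. The values below
            # are hypothetical:
            #
            #   identifier,file,title
            #   my-test-item,photo.jpg,My Photo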
            prev_identifier = None
            for row in spreadsheet:
                for metadata_key in row:
                    if not is_valid_metadata_key(metadata_key):
                        print(f"error: '{metadata_key}' is not a valid metadata key.",
                              file=sys.stderr)
                        sys.exit(1)
                upload_kwargs_copy = deepcopy(upload_kwargs)
                if row.get("REMOTE_NAME"):
                    local_file = {row["REMOTE_NAME"]: row["file"]}
                    del row["REMOTE_NAME"]
                elif args.keep_directories:
                    local_file = {row["file"]: row["file"]}
                else:
                    local_file = row["file"]
                identifier = row.get("item", row.get("identifier"))
                if not identifier:
                    if not prev_identifier:
                        print("error: no identifier column on spreadsheet.",
                              file=sys.stderr)
                        sys.exit(1)
                    identifier = prev_identifier
                del row["file"]
                if "identifier" in row:
                    del row["identifier"]
                if "item" in row:
                    del row["item"]
                item = args.session.get_item(identifier)
                # TODO: Clean up how indexed metadata items are coerced
                # into metadata.
                md_args = [f"{k.lower()}:{v}" for (k, v) in row.items() if v]
                metadata = get_args_dict(md_args)
                upload_kwargs_copy["metadata"].update(metadata)
                r = _upload_files(item, local_file, upload_kwargs_copy, prev_identifier)
                for _r in r:
                    if args.debug:
                        break
                    if (not _r.status_code) or (not _r.ok):
                        errors = True
                    else:
                        if args.open_after_upload:
                            url = (f"{args.session.protocol}//{args.session.host}"
                                   f"/details/{identifier}")
                            webbrowser.open_new_tab(url)
                prev_identifier = identifier

    if errors:
        sys.exit(1)