Post-Local sync at 2025-06-23T22:46:07Z

Mark Randall Havens 2025-06-23 17:55:02 -05:00
parent 9d33b42020
commit 9f97801b0d
1387 changed files with 250216 additions and 117 deletions


@@ -0,0 +1,58 @@
#
# The internetarchive module is a Python/CLI interface to Archive.org.
#
# Copyright (C) 2012-2024 Internet Archive
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
internetarchive.cli
~~~~~~~~~~~~~~~~~~~
:copyright: (C) 2012-2024 by Internet Archive.
:license: AGPL 3, see LICENSE for more details.
"""
from internetarchive.cli import (
cli_utils,
ia,
ia_account,
ia_configure,
ia_copy,
ia_delete,
ia_download,
ia_list,
ia_metadata,
ia_move,
ia_reviews,
ia_search,
ia_tasks,
ia_upload,
)
__all__ = [
"cli_utils",
"ia",
"ia_account",
"ia_configure",
"ia_copy",
"ia_delete",
"ia_download",
"ia_list",
"ia_metadata",
"ia_move",
"ia_reviews",
"ia_search",
"ia_tasks",
"ia_upload",
]


@@ -0,0 +1,210 @@
"""
internetarchive.cli.cli_utils
"""
# Copyright (C) 2012-2024 Internet Archive
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations
import argparse
import json
import os
import signal
import sys
from collections import defaultdict
from collections.abc import Iterable
from typing import Mapping
from urllib.parse import parse_qsl
from internetarchive.utils import InvalidIdentifierException, validate_s3_identifier
def get_args_dict(args: list[str],
query_string: bool = False,
header: bool = False) -> dict:
args = args or []
if not isinstance(args, list):
args = [args]
metadata: dict[str, list | str] = defaultdict(list)
for md in args:
if query_string:
if (":" in md) and ("=" not in md):
md = md.replace(":", "=").replace(";", "&")
for key, value in parse_qsl(md):
assert value
metadata[key] = value
else:
key, value = md.split(":", 1)
assert value
if value not in metadata[key]:
metadata[key].append(value) # type: ignore
for key in metadata: # noqa: PLC0206
# Flatten single item lists.
if len(metadata[key]) <= 1:
metadata[key] = metadata[key][0]
return metadata
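# Illustrative example (not from the original source): repeated keys collect
# into lists, and single-item lists are flattened to plain strings, e.g.
#   dict(get_args_dict(["title:Foo", "subject:a", "subject:b"]))
#   == {"title": "Foo", "subject": ["a", "b"]}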
def convert_str_list_to_unicode(str_list: list[bytes]):
encoding = sys.getfilesystemencoding()
return [b.decode(encoding) for b in str_list]
def validate_identifier(identifier):
try:
validate_s3_identifier(identifier)
except InvalidIdentifierException as e:
raise argparse.ArgumentTypeError(str(e))
return identifier
def flatten_list(lst):
"""Flatten a list if it contains lists."""
result = []
for item in lst:
if isinstance(item, Iterable) and not isinstance(item, str):
result.extend(flatten_list(item)) # Recursively flatten
else:
result.append(item) # Just append the item if it's not a list
return result
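# Illustrative example (not from the original source):
#   flatten_list([1, [2, [3, 4]], 5]) == [1, 2, 3, 4, 5]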
class FlattenListAction(argparse.Action):
def __call__(self, parser, namespace, values, option_string=None):
# Flatten the list of values (if nested)
flattened = flatten_list(values)
# Initialize the attribute if it doesn't exist yet
if getattr(namespace, self.dest, None) is None:
setattr(namespace, self.dest, [])
# Append the flattened list to the existing attribute
getattr(namespace, self.dest).extend(flattened)
class PostDataAction(argparse.Action):
def __call__(self, parser, namespace, values, option_string=None):
current_value = getattr(namespace, self.dest, None)
# Split values into individual JSON objects (if needed) and parse them
all_values = []
for value in values:
try:
obj = json.loads(value)
all_values.append(obj)
except json.JSONDecodeError:
parser.error(f"Invalid JSON format for post data: {value}")
# If there is no current value (first argument), initialize it as an object or list
if current_value is None:
# If there's only one value, don't wrap it in a list
if len(all_values) == 1:
post_data = all_values[0]
else:
post_data = all_values
elif isinstance(current_value, list):
# If it's already a list, append the new values to it
post_data = current_value + all_values
else:
# If it's a single object (first argument), convert it into a list and append new data
post_data = [current_value] + all_values
# Set the final value back to the namespace
setattr(namespace, self.dest, post_data)
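# A minimal wiring sketch (illustrative; the "--data" option name is
# hypothetical). A single JSON object parses to that object; multiple or
# repeated objects accumulate into a list:
#   parser.add_argument("--data", nargs="+", action=PostDataAction)
#   parser.parse_args(["--data", '{"a": 1}']).data == {"a": 1}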
class QueryStringAction(argparse.Action):
def __call__(self, parser, namespace, values, option_string=None):
# Initialize the destination as an empty dictionary if it doesn't exist
if getattr(namespace, self.dest, None) is None:
setattr(namespace, self.dest, {})
for sublist in values:
if "=" not in sublist and ":" in sublist:
sublist = sublist.replace(":", "=", 1)
key_value_pairs = parse_qsl(sublist)
if sublist and not key_value_pairs:
parser.error(f"{option_string} must be formatted as 'key=value' "
"or 'key:value'")
for key, value in key_value_pairs:
current_dict = getattr(namespace, self.dest)
if key in current_dict:
current_dict[key].append(value)
else:
current_dict[key] = [value]
current_dict = getattr(namespace, self.dest)
for key, value in current_dict.items():
if len(value) == 1:
current_dict[key] = value[0]
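# A minimal wiring sketch (illustrative; mirrors how the '-H/--header'
# options are declared elsewhere in this package). Repeated keys collect
# into lists; single values stay strings:
#   parser.add_argument("-H", "--header", nargs="+",
#                       action=QueryStringAction, default={})
#   parser.parse_args(["-H", "a=1", "a=2", "b=3"]).header
#   == {"a": ["1", "2"], "b": "3"}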
class MetadataAction(argparse.Action):
def __call__(self, parser, namespace, values, option_string=None):
# Initialize the destination as an empty dictionary if it doesn't exist
if getattr(namespace, self.dest, None) is None:
setattr(namespace, self.dest, {})
for sublist in values:
if ":" not in sublist and "=" in sublist:
sublist = sublist.replace("=", ":", 1)
try:
key, value = sublist.split(":", 1)
except ValueError:
parser.error(f"{option_string} must be formatted as 'KEY:VALUE'")
current_dict = getattr(namespace, self.dest)
if key in current_dict:
if not isinstance(current_dict[key], list):
current_dict[key] = [current_dict[key]]
current_dict[key].append(value)
else:
current_dict[key] = value
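# A minimal wiring sketch (illustrative; mirrors the '-m/--metadata' options
# declared elsewhere in this package):
#   parser.add_argument("-m", "--metadata", nargs="+",
#                       action=MetadataAction, default={})
#   parser.parse_args(["-m", "subject:a", "subject:b", "title:Foo"]).metadata
#   == {"subject": ["a", "b"], "title": "Foo"}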
def validate_dir_path(path):
"""
Check if the given path is a directory that exists.
Args:
path (str): The path to check.
Returns:
str: The validated directory path.
Raises:
argparse.ArgumentTypeError: If the path is not a valid directory.
"""
if os.path.isdir(path):
return path
else:
raise argparse.ArgumentTypeError(f"'{path}' is not a valid directory")
def exit_on_signal(sig, frame):
"""
Exit the program cleanly upon receiving a specified signal.
This function is designed to be used as a signal handler. When a signal
(such as SIGINT or SIGPIPE) is received, it exits the program with an
exit code of 128 plus the signal number. This convention helps to
distinguish between regular exit codes and those caused by signals.
"""
exit_code = 128 + sig
sys.exit(exit_code)
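# Illustrative wiring (ia.py registers this handler the same way): a Ctrl-C
# then exits with status 130, i.e. 128 + signal.SIGINT (2).
#   signal.signal(signal.SIGINT, exit_on_signal)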


@@ -0,0 +1,154 @@
#!/usr/bin/env python
"""
ia.py
The internetarchive module is a Python/CLI interface to Archive.org.
"""
# Copyright (C) 2012-2024 Internet Archive
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import argparse
import signal
import sys
from internetarchive import __version__, get_session
from internetarchive.cli import (
ia_account,
ia_configure,
ia_copy,
ia_delete,
ia_download,
ia_flag,
ia_list,
ia_metadata,
ia_move,
ia_reviews,
ia_search,
ia_simplelists,
ia_tasks,
ia_upload,
)
from internetarchive.cli.cli_utils import exit_on_signal
# Handle broken pipe
try:
signal.signal(signal.SIGPIPE, signal.SIG_DFL)
except AttributeError:
# SIGPIPE is not available on non-Unix platforms (e.g. Windows)
pass
# Handle <Ctrl-C>
signal.signal(signal.SIGINT, exit_on_signal)
def validate_config_path(path):
"""
Validate the path to the configuration file.
Returns:
str: Validated path to the configuration file.
"""
if "configure" not in sys.argv: # Support for adding config to specific file
file_check = argparse.FileType("r")
file_check(path)
return path
def main():
"""
Main entry point for the CLI.
"""
parser = argparse.ArgumentParser(
description="A command line interface to Archive.org.",
epilog=("Documentation for 'ia' is available at:\n\n\t"
"https://archive.org/developers/internetarchive/cli.html\n\n"
"See 'ia {command} --help' for help on a specific command."),
formatter_class=argparse.RawTextHelpFormatter) # support for \n in epilog
parser.add_argument("-v", "--version",
action="version",
version=__version__)
parser.add_argument("-c", "--config-file",
action="store",
type=validate_config_path,
metavar="FILE",
help="path to configuration file")
parser.add_argument("-l", "--log",
action="store_true",
default=False,
help="enable logging")
parser.add_argument("-d", "--debug",
action="store_true",
help="enable debugging")
parser.add_argument("-i", "--insecure",
action="store_true",
help="allow insecure connections")
parser.add_argument("-H", "--host",
action="store",
help=("host to connect to "
"(doesn't work for requests made to s3.us.archive.org)"))
subparsers = parser.add_subparsers(title="commands",
dest="command",
metavar="{command}")
# Add subcommand parsers
ia_account.setup(subparsers)
ia_configure.setup(subparsers)
ia_copy.setup(subparsers)
ia_delete.setup(subparsers)
ia_download.setup(subparsers)
ia_flag.setup(subparsers)
ia_list.setup(subparsers)
ia_metadata.setup(subparsers)
ia_move.setup(subparsers)
ia_reviews.setup(subparsers)
ia_search.setup(subparsers)
ia_simplelists.setup(subparsers)
ia_tasks.setup(subparsers)
ia_upload.setup(subparsers)
# Suppress help for alias subcommands
args = parser.parse_args()
config: dict[str, dict] = {}
if args.log:
config["logging"] = {"level": "INFO"}
elif args.debug:
config["logging"] = {"level": "DEBUG"}
if args.insecure:
config["general"] = {"secure": False}
if args.host:
if config.get("general"):
config["general"]["host"] = args.host
else:
config["general"] = {"host": args.host}
args.session = get_session(config_file=args.config_file,
config=config,
debug=args.debug)
# Check if any arguments were provided
if len(sys.argv) == 1:
parser.print_help(sys.stderr)
sys.exit(1)
args.func(args)
if __name__ == "__main__":
main()
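# Usage sketch (illustrative; the path and identifier are placeholders).
# Global options precede the subcommand, e.g.
#   $ ia --config-file /path/to/ia.ini metadata <identifier>
#   $ ia --insecure download <identifier>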


@@ -0,0 +1,110 @@
"""
ia_account.py
'ia' subcommand for managing archive.org accounts.
"""
# Copyright (C) 2012-2025 Internet Archive
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import argparse
import json
import sys
from internetarchive import configure
from internetarchive.account import Account
from internetarchive.exceptions import AccountAPIError
from internetarchive.utils import is_valid_email
def setup(subparsers):
"""
Setup args for account command.
Args:
subparsers: subparser object passed from ia.py
"""
parser = subparsers.add_parser("account",
aliases=["ac"],
description=(
"Manage an archive.org account.\n\n"
"Note: This command requires administrative "
"privileges. "
),
help=("Manage an archive.org account. "
"Note: requires admin privileges"))
group = parser.add_mutually_exclusive_group()
parser.add_argument("user",
help="Email address, screenname, or itemname "
"for an archive.org account")
group.add_argument("-g", "--get-email",
action="store_true",
help="Print the email address associated with the user and exit")
group.add_argument("-s", "--get-screenname",
action="store_true",
help="Print the screenname associated with the user and exit")
group.add_argument("-i", "--get-itemname",
action="store_true",
help="Print the itemname associated with the user and exit")
group.add_argument("-l", "--is-locked",
action="store_true",
help="Check if an account is locked")
group.add_argument("-L", "--lock",
action="store_true",
help="Lock an account")
group.add_argument("-u", "--unlock",
action="store_true",
help="Unlock an account")
parser.add_argument("-c", "--comment",
type=str,
help="Comment to include with lock/unlock action")
parser.set_defaults(func=main)
def main(args: argparse.Namespace) -> None:
"""
Main entrypoint for 'ia account'.
"""
try:
if args.user.startswith('@'):
account = Account.from_account_lookup('itemname', args.user)
elif not is_valid_email(args.user):
account = Account.from_account_lookup('screenname', args.user)
else:
account = Account.from_account_lookup('email', args.user)
except AccountAPIError as exc:
print(json.dumps(exc.error_data))
sys.exit(1)
if args.get_email:
print(account.canonical_email)
elif args.get_screenname:
print(account.screenname)
elif args.get_itemname:
print(account.itemname)
elif args.is_locked:
print(account.locked)
elif args.lock:
r = account.lock(args.comment, session=args.session)
print(r.text)
elif args.unlock:
r = account.unlock(args.comment, session=args.session)
print(r.text)
else:
account_data = account.to_dict()
print(json.dumps(account_data))
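# Usage sketch (illustrative; requires admin privileges, values are placeholders):
#   $ ia account user@example.com             # dump account info as JSON
#   $ ia account @some-itemname --get-email   # print the associated email
#   $ ia account some-screenname --is-locked  # check lock status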


@@ -0,0 +1,179 @@
"""
ia_configure.py
'ia' subcommand for configuring 'ia' with your archive.org credentials.
"""
# Copyright (C) 2012-2024 Internet Archive
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations
import argparse
import json
import netrc
import sys
from internetarchive import configure
from internetarchive.exceptions import AuthenticationError
def setup(subparsers):
"""
Setup args for configure command.
Args:
subparsers: subparser object passed from ia.py
"""
parser = subparsers.add_parser("configure",
aliases=["co"],
help=("configure 'ia' with your "
"archive.org credentials"))
config_action_group = parser.add_mutually_exclusive_group()
parser.add_argument("--username", "-u",
help=("provide username as an option rather than "
"providing it interactively"))
parser.add_argument("--password", "-p",
help=("provide password as an option rather than "
"providing it interactively"))
parser.add_argument("--netrc", "-n",
action="store_true",
help="use netrc file for login")
config_action_group.add_argument("--show", "-s",
action="store_true",
help=("print the current configuration in JSON format, "
"redacting secrets and cookies"))
config_action_group.add_argument("--check", "-C",
action="store_true",
help="validate IA-S3 keys (exits 0 if valid, 1 otherwise)")
config_action_group.add_argument("--whoami", "-w",
action="store_true",
help=("uses your IA-S3 keys to retrieve account "
"information from archive.org "
"about the associated account"))
parser.add_argument("--print-cookies", "-c",
action="store_true",
help="print archive.org logged-in-* cookies")
parser.add_argument("--print-auth-header", "-a",
action="store_true",
help="print an Authorization header with your IA-S3 keys")
parser.set_defaults(func=main)
def main(args: argparse.Namespace) -> None:
"""
Main entrypoint for 'ia configure'.
"""
if args.print_auth_header:
secret = args.session.config.get("s3", {}).get("secret")
access = args.session.config.get("s3", {}).get("access")
if not secret or not access:
if not access:
print("error: 'access' key not found in config file, try reconfiguring.",
file=sys.stderr)
elif not secret:
print("error: 'secret' key not found in config file, try reconfiguring.",
file=sys.stderr)
sys.exit(1)
print(f"Authorization: LOW {access}:{secret}")
sys.exit()
if args.print_cookies:
user = args.session.config.get("cookies", {}).get("logged-in-user")
sig = args.session.config.get("cookies", {}).get("logged-in-sig")
if not user or not sig:
if not user and not sig:
print("error: 'logged-in-user' and 'logged-in-sig' cookies "
"not found in config file, try reconfiguring.", file=sys.stderr)
elif not user:
print("error: 'logged-in-user' cookie not found in config file, "
"try reconfiguring.", file=sys.stderr)
elif not sig:
print("error: 'logged-in-sig' cookie not found in config file, "
"try reconfiguring.", file=sys.stderr)
sys.exit(1)
print(f"logged-in-user={user}; logged-in-sig={sig}")
sys.exit()
if args.show:
config = args.session.config.copy()
# Redact S3 secret
if 's3' in config:
s3_config = config['s3'].copy()
if 'secret' in s3_config:
s3_config['secret'] = 'REDACTED'
config['s3'] = s3_config
# Redact logged-in-secret cookie
if 'cookies' in config:
cookies = config['cookies'].copy()
if 'logged-in-sig' in cookies:
cookies['logged-in-sig'] = 'REDACTED'
config['cookies'] = cookies
print(json.dumps(config))
sys.exit()
if args.whoami:
whoami_info = args.session.whoami()
print(json.dumps(whoami_info))
sys.exit()
if args.check:
whoami_info = args.session.whoami()
if whoami_info.get('success') is True:
user = whoami_info['value']['username']
print(f'The credentials for "{user}" are valid')
sys.exit(0)
else:
print('Your credentials are invalid, check your configuration and try again')
sys.exit(1)
try:
# Netrc
if args.netrc:
print("Configuring 'ia' with netrc file...", file=sys.stderr)
try:
n = netrc.netrc()
except netrc.NetrcParseError:
print("error: netrc.netrc() cannot parse your .netrc file.",
file=sys.stderr)
sys.exit(1)
except FileNotFoundError:
print("error: .netrc file not found.", file=sys.stderr)
sys.exit(1)
username, _, password = n.hosts["archive.org"]
config_file_path = configure(username,
password or "",
config_file=args.session.config_file,
host=args.session.host)
print(f"Config saved to: {config_file_path}", file=sys.stderr)
# Interactive input.
else:
if not (args.username and args.password):
print("Enter your Archive.org credentials below to configure 'ia'.\n")
config_file_path = configure(args.username,
args.password,
config_file=args.session.config_file,
host=args.session.host)
saved_msg = f"Config saved to: {config_file_path}"
if not all([args.username, args.password]):
saved_msg = f"\n{saved_msg}"
print(saved_msg)
except AuthenticationError as exc:
print(f"\nerror: {exc}", file=sys.stderr)
sys.exit(1)
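# Usage sketch (illustrative):
#   $ ia configure           # prompt interactively for credentials
#   $ ia configure --netrc   # read credentials from your .netrc
#   $ ia configure --show    # print config as JSON, secrets redacted
#   $ ia configure --check   # exit 0 if your IA-S3 keys are valid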


@@ -0,0 +1,164 @@
"""
ia_copy.py
'ia' subcommand for copying files on archive.org
"""
# Copyright (C) 2012-2024 Internet Archive
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations
import argparse
import sys
from typing import Optional
from urllib.parse import quote
from requests import Response
import internetarchive as ia
from internetarchive.cli.cli_utils import MetadataAction, QueryStringAction
from internetarchive.utils import get_s3_xml_text, merge_dictionaries
def setup(subparsers):
"""
Setup args for copy command.
Args:
subparsers: subparser object passed from ia.py
"""
parser = subparsers.add_parser("copy",
aliases=["cp"],
help="Copy files from archive.org items")
# Positional arguments
parser.add_argument("source",
metavar="SOURCE",
help="Source file formatted as: identifier/file")
parser.add_argument("destination",
metavar="DESTINATION",
help="Destination file formatted as: identifier/file")
# Options
parser.add_argument("-m", "--metadata",
metavar="KEY:VALUE",
nargs="+",
default={},
action=MetadataAction,
help=("Metadata to add to your new item, if you are moving the "
"file to a new item"))
parser.add_argument("--replace-metadata",
action="store_true",
help=("Only use metadata specified as argument, do not copy any "
"from the source item"))
parser.add_argument("-H", "--header",
metavar="KEY:VALUE",
nargs="+",
default={},
action=QueryStringAction,
help="S3 HTTP headers to send with your request")
parser.add_argument("--ignore-file-metadata",
action="store_true",
help="Do not copy file metadata")
parser.add_argument("-n", "--no-derive",
action="store_true",
help="Do not derive uploaded files")
parser.add_argument("--no-backup",
action="store_true",
help=("Turn off archive.org backups, "
"clobbered files will not be saved to "
"'history/files/$key.~N~'"))
parser.set_defaults(func=lambda args: main(args, "copy", parser))
def assert_src_file_exists(src_location: str) -> bool:
"""
Assert that the source file exists on archive.org.
"""
assert SRC_ITEM.exists # type: ignore
global SRC_FILE
src_filename = src_location.split("/", 1)[-1]
SRC_FILE = SRC_ITEM.get_file(src_filename) # type: ignore
assert SRC_FILE.exists # type: ignore
return True
def main(args: argparse.Namespace,
cmd: str,
parser: argparse.ArgumentParser) -> tuple[Response, ia.files.File | None]:
"""
Main entry point for 'ia copy'.
"""
SRC_FILE = None
if args.source == args.destination:
parser.error("error: The source and destination files cannot be the same!")
global SRC_ITEM
SRC_ITEM = args.session.get_item(args.source.split("/")[0]) # type: ignore
SRC_FILE = SRC_ITEM.get_file(args.source.split("/",1)[-1]) # type: ignore
try:
assert_src_file_exists(args.source)
except AssertionError:
parser.error(f"error: https://{args.session.host}/download/{args.source} "
"does not exist. Please check the "
"identifier and filepath and retry.")
args.header["x-amz-copy-source"] = f"/{quote(args.source)}"
# Copy the old metadata verbatim if no additional metadata is supplied,
# else combine the old and the new metadata in a sensible manner.
if args.metadata or args.replace_metadata:
args.header["x-amz-metadata-directive"] = "REPLACE"
else:
args.header["x-amz-metadata-directive"] = "COPY"
# New metadata takes precedence over old metadata.
if not args.replace_metadata:
args.metadata = merge_dictionaries(SRC_ITEM.metadata, # type: ignore
args.metadata)
# File metadata is copied by default but can be dropped.
file_metadata = None if args.ignore_file_metadata else SRC_FILE.metadata # type: ignore
# Add keep-old-version by default.
if not args.header.get("x-archive-keep-old-version") and not args.no_backup:
args.header["x-archive-keep-old-version"] = "1"
url = f"{args.session.protocol}//s3.us.archive.org/{quote(args.destination)}"
queue_derive = not args.no_derive
req = ia.iarequest.S3Request(url=url,
method="PUT",
metadata=args.metadata,
file_metadata=file_metadata,
headers=args.header,
queue_derive=queue_derive,
access_key=args.session.access_key,
secret_key=args.session.secret_key)
p = req.prepare()
r = args.session.send(p)
if r.status_code != 200:
try:
msg = get_s3_xml_text(r.text)
except Exception:
msg = r.text
print(f"error: failed to {cmd} '{args.source}' to '{args.destination}' - {msg}",
file=sys.stderr)
sys.exit(1)
elif cmd == "copy":
print(f"success: copied '{args.source}' to '{args.destination}'.",
file=sys.stderr)
return (r, SRC_FILE)
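# Usage sketch (illustrative; identifiers and filenames are placeholders):
#   $ ia copy src-item/file.txt dst-item/file.txt
#   $ ia copy src-item/file.txt new-item/file.txt -m title:"My new item"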


@@ -0,0 +1,182 @@
"""
ia_delete.py
'ia' subcommand for deleting files from archive.org items.
"""
# Copyright (C) 2012-2024 Internet Archive
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import argparse
import sys
import requests.exceptions
from internetarchive.cli.cli_utils import (
FlattenListAction,
MetadataAction,
QueryStringAction,
validate_identifier,
)
from internetarchive.utils import get_s3_xml_text
def setup(subparsers):
"""
Setup args for delete command.
Args:
subparsers: subparser object passed from ia.py
"""
parser = subparsers.add_parser("delete",
aliases=["rm"],
help="Delete files from archive.org items")
# Positional arguments
parser.add_argument("identifier",
type=validate_identifier,
help="Identifier for the item from which files are to be deleted.")
parser.add_argument("file",
type=str,
nargs="*",
help="Specific file(s) to delete.")
# Optional arguments
parser.add_argument("-q", "--quiet",
action="store_true",
help="Print status to stdout.")
parser.add_argument("-c", "--cascade",
action="store_true",
help="Delete all associated files including derivatives and the original.")
parser.add_argument("-H", "--header",
nargs="+",
action=QueryStringAction,
default={},
metavar="KEY:VALUE",
help="S3 HTTP headers to send with your request.")
parser.add_argument("-a", "--all",
action="store_true",
help="Delete all files in the given item. Some files cannot be deleted.")
parser.add_argument("-d", "--dry-run",
action="store_true",
help=("Output files to be deleted to stdout, "
"but don't actually delete them."))
parser.add_argument("-g", "--glob",
type=str,
help="Only delete files matching the given pattern.")
parser.add_argument("-f", "--format",
type=str,
nargs="+",
action=FlattenListAction,
help="Only delete files matching the specified formats.")
parser.add_argument("-R", "--retries",
type=int,
default=2,
help="Number of retries on S3 503 SlowDown error.")
parser.add_argument("--no-backup",
action="store_true",
help="Turn off archive.org backups. Clobbered files will not be saved.")
parser.set_defaults(func=lambda args: main(args, parser))
def get_files_to_delete(args: argparse.Namespace, item) -> list:
"""Get files to delete based on command-line arguments."""
if args.all:
files = list(item.get_files())
args.cascade = True
elif args.glob:
files = item.get_files(glob_pattern=args.glob)
elif args.format:
files = item.get_files(formats=args.format)
else:
fnames = [f.strip() for f in (sys.stdin if args.file == ["-"] else args.file)]
files = list(item.get_files(fnames))
return files
def delete_files(files, args, item, verbose):
"""
Deletes files from an item.
Args:
files (list): A list of files to delete.
args (argparse.Namespace): Parsed command-line arguments.
item: The item from which files are being deleted.
verbose (bool): If True, verbose output is enabled.
Returns:
bool: True if errors occurred during deletion, False otherwise.
"""
errors = False
# Files that cannot be deleted via S3.
no_delete = ["_meta.xml", "_files.xml", "_meta.sqlite"]
for f in files:
if not f:
if verbose:
print(f" error: '{f.name}' does not exist", file=sys.stderr)
errors = True
continue
if any(f.name.endswith(s) for s in no_delete):
continue
if args.dry_run:
print(f" will delete: {item.identifier}/{f.name}", file=sys.stderr)
continue
try:
resp = f.delete(verbose=verbose,
cascade_delete=args.cascade,
headers=args.header,
retries=args.retries)
except requests.exceptions.RetryError:
print(f" error: max retries exceeded for {f.name}", file=sys.stderr)
errors = True
continue
if resp.status_code != 204:
errors = True
msg = get_s3_xml_text(resp.content)
print(f" error: {msg} ({resp.status_code})", file=sys.stderr)
continue
return errors
def main(args: argparse.Namespace, parser: argparse.ArgumentParser):
"""
Main entry point for 'ia delete'.
"""
verbose = not args.quiet
item = args.session.get_item(args.identifier)
if not item.exists:
print(f"{item.identifier}: skipping, item doesn't exist.", file=sys.stderr)
return
# Add keep-old-version by default.
if "x-archive-keep-old-version" not in args.header and not args.no_backup:
args.header["x-archive-keep-old-version"] = "1"
if verbose:
print(f"Deleting files from {item.identifier}", file=sys.stderr)
files = get_files_to_delete(args, item)
if not files:
print(" warning: no files found, nothing deleted.", file=sys.stderr)
sys.exit(1)
errors = delete_files(files, args, item, verbose)
if errors:
sys.exit(1)
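# Usage sketch (illustrative; identifier and patterns are placeholders):
#   $ ia delete my-item file.txt                  # delete a single file
#   $ ia delete my-item --glob "*.txt" --dry-run  # preview matching deletes
#   $ ia delete my-item --all                     # delete all deletable files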


@@ -0,0 +1,238 @@
"""
ia_download.py
'ia' subcommand for downloading files from archive.org.
"""
# Copyright (C) 2012-2024 Internet Archive
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations
import argparse
import sys
from typing import TextIO
from internetarchive.cli.cli_utils import (
QueryStringAction,
validate_dir_path,
validate_identifier,
)
from internetarchive.files import File
from internetarchive.search import Search
def setup(subparsers):
"""
Setup args for download command.
Args:
subparsers: subparser object passed from ia.py
"""
parser = subparsers.add_parser("download",
aliases=["do"],
help="Retrieve and modify archive.org item metadata")
# Main options
parser.add_argument("identifier",
nargs="?",
type=validate_identifier,
help="Identifier for the upload")
parser.add_argument("file",
nargs="*",
help="Files to download")
# Additional options
parser.add_argument("-q", "--quiet",
action="store_true",
help="Turn off ia's output [default: False]")
parser.add_argument("-d", "--dry-run",
action="store_true",
help="Print URLs to stdout and exit")
parser.add_argument("-i", "--ignore-existing",
action="store_true",
help="Clobber files already downloaded")
parser.add_argument("-C", "--checksum",
action="store_true",
help="Skip files based on checksum [default: False]")
parser.add_argument("--checksum-archive",
action="store_true",
help="Skip files based on _checksum_archive.txt file")
parser.add_argument("-R", "--retries",
type=int,
default=5,
help="Set number of retries to <retries> [default: 5]")
parser.add_argument("-I", "--itemlist",
type=argparse.FileType("r"),
help=("Download items from a specified file. "
"Itemlists should be a plain text file with one "
"identifier per line"))
parser.add_argument("-S", "--search",
help="Download items returned from a specified search query")
parser.add_argument("-P", "--search-parameters",
nargs="+",
action=QueryStringAction,
metavar="KEY:VALUE",
help="Parameters to send with your --search query")
parser.add_argument("-g", "--glob",
help=("Only download files whose filename matches "
"the given glob pattern"))
parser.add_argument("-e", "--exclude",
help=("Exclude files whose filename matches "
"the given glob pattern"))
parser.add_argument("-f", "--format",
nargs="+",
help=("Only download files of the specified format. "
"Use this option multiple times to download "
"multiple formats. You can use the following command to "
"retrieve a list of file formats contained within a "
"given item: ia metadata --formats <identifier>"))
parser.add_argument("--on-the-fly",
action="store_true",
help=("Download on-the-fly files, as well as other "
"matching files. on-the-fly files include derivative "
"EPUB, MOBI and DAISY files [default: False]"))
parser.add_argument("--no-directories",
action="store_true",
help=("Download files into working directory. "
"Do not create item directories"))
parser.add_argument("--destdir",
type=validate_dir_path,
help=("The destination directory to download files "
"and item directories to"))
parser.add_argument("-s", "--stdout",
action="store_true",
help="Write file contents to stdout")
parser.add_argument("--no-change-timestamp",
action="store_true",
help=("Don't change the timestamp of downloaded files to reflect "
"the source material"))
parser.add_argument("-p", "--parameters",
nargs="+",
action=QueryStringAction,
metavar="KEY:VALUE",
help="Parameters to send with your download request (e.g. `cnt=0`)")
parser.add_argument("-a", "--download-history",
action="store_true",
help="Also download files from the history directory")
parser.add_argument("--source",
nargs="+",
help=("Filter files based on their source value in files.xml "
"(i.e. `original`, `derivative`, `metadata`)"))
parser.add_argument("--exclude-source",
nargs="+",
help=("Filter files based on their source value in files.xml "
"(i.e. `original`, `derivative`, `metadata`)"))
parser.add_argument("-t", "--timeout",
type=float,
help=("Set a timeout for download requests. "
"This sets both connect and read timeout"))
parser.set_defaults(func=lambda args: main(args, parser))
def main(args: argparse.Namespace, parser: argparse.ArgumentParser) -> None:
"""
Main entry point for 'ia download'.
"""
ids: list[File | str] | Search | TextIO
if args.itemlist:
ids = [x.strip() for x in args.itemlist]
total_ids = len(ids)
elif args.search:
try:
_search = args.session.search_items(args.search,
params=args.search_parameters)
total_ids = _search.num_found
if total_ids == 0:
print(f"error: the query '{args.search}' returned no results", file=sys.stderr)
sys.exit(1)
ids = _search
except ValueError as e:
print(f"error: {e}", file=sys.stderr)
sys.exit(1)
# Download specific files.
if args.identifier and args.identifier != "-":
if "/" in args.identifier:
identifier = args.identifier.split("/")[0]
files = ["/".join(args.identifier.split("/")[1:])]
else:
identifier = args.identifier
files = args.file
total_ids = 1
ids = [identifier]
elif args.identifier == "-":
total_ids = 1
ids = sys.stdin
files = None
else:
files = None
errors = []
for i, identifier in enumerate(ids):
try:
identifier = identifier.strip()
except AttributeError:
identifier = identifier.get("identifier")
if total_ids > 1:
item_index = f"{i + 1}/{total_ids}"
else:
item_index = None
try:
item = args.session.get_item(identifier)
except Exception as exc:
print(f"{identifier}: failed to retrieve item metadata - errors", file=sys.stderr)
if "You are attempting to make an HTTPS" in str(exc):
print(f"\n{exc}", file=sys.stderr)
sys.exit(1)
else:
continue
# Otherwise, download the entire item.
ignore_history_dir = bool(args.download_history)
_errors = item.download(
files=files,
formats=args.format,
glob_pattern=args.glob,
exclude_pattern=args.exclude,
dry_run=args.dry_run,
verbose=not args.quiet,
ignore_existing=args.ignore_existing,
checksum=args.checksum,
checksum_archive=args.checksum_archive,
destdir=args.destdir,
no_directory=args.no_directories,
retries=args.retries,
item_index=item_index,
ignore_errors=True,
on_the_fly=args.on_the_fly,
no_change_timestamp=args.no_change_timestamp,
params=args.parameters,
ignore_history_dir=ignore_history_dir,
source=args.source,
exclude_source=args.exclude_source,
stdout=args.stdout,
timeout=args.timeout,
)
if _errors:
errors.append(_errors)
if errors:
# TODO: add option for a summary/report.
sys.exit(1)
else:
sys.exit(0)
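# Usage sketch (illustrative; identifier and query are placeholders):
#   $ ia download my-item                      # download the whole item
#   $ ia download my-item --glob "*.pdf"       # only files matching a pattern
#   $ ia download --search "collection:nasa"   # every item matching a query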


@@ -0,0 +1,103 @@
"""
ia_flag.py
'ia' subcommand for managing flags on archive.org.
"""
# Copyright (C) 2012-2025 Internet Archive
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations
import argparse
def setup(subparsers):
"""Set up argument parser for the 'flag' subcommand.
Args:
subparsers: argparse subparsers object from main CLI
"""
parser = subparsers.add_parser(
"flag",
aliases=["fl"],
help="Manage flags",
)
parser.add_argument(
"identifier",
nargs="?",
type=str,
help="Identifier for the upload",
)
parser.add_argument(
"-u",
"--user",
type=str,
help="User associated with the flag",
)
group = parser.add_argument_group("Add flag operations")
group.add_argument(
"-a",
"--add-flag",
metavar="CATEGORY",
type=str,
help="Add a flag to the item",
)
group = parser.add_argument_group("Delete flag operations")
group.add_argument(
"-d",
"--delete-flag",
metavar="CATEGORY",
type=str,
help="Delete a flag from the item",
)
parser.set_defaults(func=lambda args: main(args, parser))
def main(args: argparse.Namespace, parser: argparse.ArgumentParser) -> None:
"""Handle flag subcommand execution.
Args:
args: Parsed command-line arguments
parser: Argument parser for error handling
"""
item = args.session.get_item(args.identifier)
if args.user:
flag_user = args.user
else:
flag_user = args.session.config.get("general", {}).get("screenname")
if not flag_user.startswith('@'):
flag_user = f"@{flag_user}"
if args.add_flag:
r = item.add_flag(args.add_flag, flag_user)
j = r.json()
if j.get("status") == "success":
print(f"success: added '{args.add_flag}' flag by {flag_user} to {args.identifier}")
else:
print(f"error: {item.identifier} - {r.text}")
elif args.delete_flag:
r = item.delete_flag(args.delete_flag, flag_user)
j = r.json()
if j.get("status") == "success":
print(f"success: deleted '{args.delete_flag}' flag by {flag_user} from {args.identifier}")
else:
print(f"error: {item.identifier} - {r.text}")
else:
r = item.get_flags()
print(r.text)
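# Usage sketch (illustrative; identifier and CATEGORY are placeholders,
# and listing flags assumes a configured screenname or -u):
#   $ ia flag my-item                          # list flags on the item
#   $ ia flag my-item --add-flag CATEGORY -u @screenname
#   $ ia flag my-item --delete-flag CATEGORY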


@@ -0,0 +1,151 @@
"""
ia_list.py
'ia' subcommand for listing files from archive.org items.
"""
# Copyright (C) 2012-2024 Internet Archive
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import argparse
import csv
import sys
from fnmatch import fnmatch
from itertools import chain
from internetarchive.cli.cli_utils import validate_identifier
def setup(subparsers):
"""
Setup args for list command.
Args:
subparsers: subparser object passed from ia.py
"""
parser = subparsers.add_parser("list",
aliases=["ls"],
help="list files from archive.org items")
# Positional arguments
parser.add_argument("identifier",
type=validate_identifier,
help="identifier of the item")
# Options
parser.add_argument("-v", "--verbose",
action="store_true",
help="print column headers")
parser.add_argument("-a", "--all",
action="store_true",
help="list all information available for files")
parser.add_argument("-l", "--location",
action="store_true",
help="print full URL for each file")
parser.add_argument("-c", "--columns",
action="append",
type=prepare_columns,
help="list specified file information")
parser.add_argument("-g", "--glob",
help="only return files matching the given pattern")
parser.add_argument("-f", "--format",
action="append",
help="return files matching FORMAT")
parser.set_defaults(func=main)
def prepare_columns(columns):
"""
Split comma-separated column arguments into a flat list of column names.
Returns:
    list: Parsed column names, or None if no columns were given.
"""
if columns:
if not isinstance(columns, list):
columns = [columns]
return list(chain.from_iterable([c.split(",") for c in columns]))
return None
def setup_columns(args, files):
"""
Setup and adjust columns for output based on args.
"""
if not args.columns:
args.columns = ["name"]
else:
args.columns = list(chain.from_iterable(args.columns))
if args.all:
args.columns = list(set(chain.from_iterable(k for k in files)))
# Make "name" the first column always.
if "name" in args.columns:
args.columns.remove("name")
args.columns.insert(0, "name")
def filter_files(args, files, item):
"""
Filter files based on glob patterns or formats.
"""
if args.glob:
patterns = args.glob.split("|")
return [f for f in files if any(fnmatch(f["name"], p) for p in patterns)]
if args.format:
return [f.__dict__ for f in item.get_files(formats=args.format)]
return files
def generate_output(files, args, dict_writer, item):
"""
Generate and write output based on filtered files and columns.
"""
output = []
for f in files:
file_dict = {}
for key, val in f.items():
if key in args.columns:
if isinstance(val, (list, tuple, set)):
val = ";".join(val)
if key == "name" and args.location:
file_dict[key] = (f"https://{args.session.host}"
f"/download/{item.identifier}/{val}")
else:
file_dict[key] = val
output.append(file_dict)
if args.verbose:
dict_writer.writer.writerow(args.columns)
if all(x == {} for x in output):
sys.exit(1)
dict_writer.writerows(output)
def main(args: argparse.Namespace) -> None:
"""
Main entry point for 'ia list'.
"""
item = args.session.get_item(args.identifier)
files = item.files
setup_columns(args, files)
files = filter_files(args, files, item)
dict_writer = csv.DictWriter(sys.stdout, args.columns,
delimiter="\t",
lineterminator="\n")
generate_output(files, args, dict_writer, item)
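# Usage sketch (illustrative; identifier is a placeholder):
#   $ ia list my-item                    # one filename per line
#   $ ia list my-item -v -c name,size    # tab-separated columns with header
#   $ ia list my-item -g "*.mp3" -l      # full download URLs for matches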


@@ -0,0 +1,324 @@
"""
ia_metadata.py
'ia' subcommand for modifying and retrieving metadata from archive.org items.
"""
# Copyright (C) 2012-2024 Internet Archive
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations
import argparse
import csv
import sys
from collections import defaultdict
from copy import copy
from typing import Mapping
from requests import Response
from internetarchive import item
from internetarchive.cli.cli_utils import MetadataAction, QueryStringAction
from internetarchive.exceptions import ItemLocateError
from internetarchive.utils import json
def setup(subparsers):
"""
Setup args for metadata command.
Args:
subparsers: subparser object passed from ia.py
"""
parser = subparsers.add_parser("metadata",
aliases=["md"],
help="Retrieve and modify archive.org item metadata")
parser.add_argument("identifier",
nargs="?",
type=str,
help="Identifier for the upload")
# Mutually exclusive group for metadata modification options
modify_group = parser.add_mutually_exclusive_group()
modify_group.add_argument("-m", "--modify",
nargs="+",
action=MetadataAction,
metavar="KEY:VALUE",
help="Modify the metadata of an item")
modify_group.add_argument("-r", "--remove",
nargs="+",
action=MetadataAction,
metavar="KEY:VALUE",
help="Remove KEY:VALUE from a metadata element")
modify_group.add_argument("-a", "--append",
nargs="+",
action=MetadataAction,
metavar="KEY:VALUE",
help="Append a string to a metadata element")
modify_group.add_argument("-A", "--append-list",
nargs="+",
action=MetadataAction,
metavar="KEY:VALUE",
help="Append a field to a metadata element")
modify_group.add_argument("-i", "--insert",
nargs="+",
action=MetadataAction,
metavar="KEY:VALUE",
help=("Insert a value into a multi-value field given "
"an index (e.g. `--insert=collection[0]:foo`)"))
# Additional options
parser.add_argument("-E", "--expect",
nargs="+",
action=MetadataAction,
metavar="KEY:VALUE",
help=("Test an expectation server-side before applying patch "
"to item metadata"))
parser.add_argument("-H", "--header",
nargs="+",
action=QueryStringAction,
metavar="KEY:VALUE",
help="S3 HTTP headers to send with your request")
parser.add_argument("-t", "--target",
metavar="target",
default="metadata",
help="The metadata target to modify")
parser.add_argument("-s", "--spreadsheet",
metavar="metadata.csv",
help="Modify metadata in bulk using a spreadsheet as input")
parser.add_argument("-e", "--exists",
action="store_true",
help="Check if an item exists")
parser.add_argument("-F", "--formats",
action="store_true",
help="Return the file-formats the given item contains")
parser.add_argument("-p", "--priority",
metavar="priority",
help="Set the task priority")
parser.add_argument("--timeout",
metavar="value",
help="Set a timeout for metadata writes")
parser.add_argument("-R", "--reduced-priority",
action="store_true",
help="Submit task at a reduced priority.")
parser.set_defaults(func=lambda args: main(args, parser))
def modify_metadata(item: item.Item,
metadata: Mapping,
args: argparse.Namespace,
parser: argparse.ArgumentParser) -> Response:
"""
Modify metadata helper function.
"""
append = bool(args.append)
append_list = bool(args.append_list)
insert = bool(args.insert)
try:
r = item.modify_metadata(metadata, target=args.target, append=append,
expect=args.expect, priority=args.priority,
append_list=append_list, headers=args.header,
insert=insert, reduced_priority=args.reduced_priority,
timeout=args.timeout)
assert isinstance(r, Response) # mypy: modify_metadata() -> Request | Response
except ItemLocateError as exc:
print(f"{item.identifier} - error: {exc}", file=sys.stderr)
sys.exit(1)
if not r.json()["success"]:
error_msg = r.json()["error"]
etype = "warning" if "no changes" in r.text else "error"
print(f"{item.identifier} - {etype} ({r.status_code}): {error_msg}", file=sys.stderr)
return r
print(f"{item.identifier} - success: {r.json()['log']}", file=sys.stderr)
return r
def remove_metadata(item: item.Item,
metadata: Mapping,
args: argparse.Namespace,
parser: argparse.ArgumentParser) -> Response:
"""
Remove metadata helper function.
"""
md: dict[str, list | str] = defaultdict(list)
for key in metadata:
src_md = {}
if args.target.startswith("files/"):
for f in item.get_files():
if f.name == "/".join(args.target.split("/")[1:]):
src_md = f.__dict__.get(key, {})
break
else:
src_md = copy(item.metadata.get(key, {}))
if not src_md:
continue
if key == "collection":
_col = copy(metadata[key])
_src_md = copy(src_md)
if not isinstance(_col, list):
_col = [_col]
if not isinstance(_src_md, list):
_src_md = [_src_md] # type: ignore
for c in _col:
if c not in _src_md:
r = item.remove_from_simplelist(c, "holdings")
j = r.json()
if j.get("success"):
print(f"{item.identifier} - success: {item.identifier} no longer in {c}",
file=sys.stderr)
sys.exit(0)
elif j.get("error", "").startswith("no row to delete for"):
print(f"{item.identifier} - success: {item.identifier} no longer in {c}",
file=sys.stderr)
sys.exit(0)
else:
print(f"{item.identifier} - error: {j.get('error')}", file=sys.stderr)
sys.exit(1)
if not isinstance(src_md, list):
if key == "subject":
if isinstance(src_md, str):
src_md = src_md.split(";")
elif key == "collection":
print(f"{item.identifier} - error: all collections would be removed, "
"not submitting task.", file=sys.stderr)
sys.exit(1)
if src_md == metadata[key]:
md[key] = "REMOVE_TAG"
continue
for x in src_md:
if isinstance(metadata[key], list):
if x not in metadata[key]:
md[key].append(x) # type: ignore
else:
if x != metadata[key]:
md[key].append(x) # type: ignore
if len(md[key]) == len(src_md):
del md[key]
if md.get("collection") == []:
print(f"{item.identifier} - error: all collections would be removed, not submitting task.",
file=sys.stderr)
sys.exit(1)
elif not md:
print(f"{item.identifier} - warning: nothing needed to be removed.", file=sys.stderr)
sys.exit(0)
r = modify_metadata(item, md, args, parser)
return r
def main(args: argparse.Namespace, parser: argparse.ArgumentParser) -> None:
"""
Main entry point for 'ia metadata'.
"""
formats = set()
responses: list[bool | Response] = []
item = args.session.get_item(args.identifier)
# Check existence of item.
if args.exists:
if item.exists:
responses.append(True)
print(f"{args.identifier} exists", file=sys.stderr)
else:
responses.append(False)
print(f"{args.identifier} does not exist", file=sys.stderr)
if all(r is True for r in responses):
sys.exit(0)
else:
sys.exit(1)
# Modify metadata.
elif (args.modify or args.append or args.append_list
or args.remove or args.insert):
# TODO: Find a better way to handle this.
if args.modify:
metadata = args.modify
elif args.append:
metadata = args.append
elif args.append_list:
metadata = args.append_list
elif args.insert:
metadata = args.insert
if args.remove:
metadata = args.remove
if args.remove:
responses.append(remove_metadata(item, metadata, args, parser))
else:
responses.append(modify_metadata(item, metadata, args, parser))
if all(r.status_code == 200 for r in responses): # type: ignore
sys.exit(0)
else:
for r in responses:
assert isinstance(r, Response)
if r.status_code == 200:
continue
# We still want to exit 0 if the non-200 is a
# "no changes to xml" error.
elif "no changes" in r.text:
continue
else:
sys.exit(1)
# Get metadata.
elif args.formats:
for f in item.get_files():
formats.add(f.format)
print("\n".join(formats))
# Edit metadata for items in bulk, using a spreadsheet as input.
elif args.spreadsheet:
if not args.priority:
args.priority = -5
with open(args.spreadsheet, newline="", encoding="utf-8-sig") as csvfp:
spreadsheet = csv.DictReader(csvfp)
responses = []
for row in spreadsheet:
if not row["identifier"]:
continue
item = args.session.get_item(row["identifier"])
if row.get("file"):
del row["file"]
metadata = {k.lower(): v for k, v in row.items() if v}
responses.append(modify_metadata(item, metadata, args, parser))
if all(r.status_code == 200 for r in responses): # type: ignore
sys.exit(0)
else:
for r in responses:
assert isinstance(r, Response)
if r.status_code == 200:
continue
# We still want to exit 0 if the non-200 is a
# "no changes to xml" error.
elif "no changes" in r.text:
continue
else:
sys.exit(1)
# Dump JSON to stdout.
else:
metadata_str = json.dumps(item.item_metadata)
print(metadata_str)
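# Usage sketch (illustrative; identifier and values are placeholders):
#   $ ia metadata my-item                              # dump metadata as JSON
#   $ ia metadata my-item --modify title:"New title"
#   $ ia metadata my-item --append-list subject:maps
#   $ ia metadata --spreadsheet metadata.csv           # bulk edit from CSV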


@@ -0,0 +1,97 @@
"""
ia_move.py
'ia' subcommand for moving files on archive.org
"""
# Copyright (C) 2012-2024 Internet Archive
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import argparse
import sys
from internetarchive.cli import ia_copy
from internetarchive.cli.cli_utils import MetadataAction, QueryStringAction
def setup(subparsers):
"""
Setup args for move command.
Args:
subparsers: subparser object passed from ia.py
"""
parser = subparsers.add_parser("move",
aliases=["mv"],
help="Move and rename files in archive.org items")
# Positional arguments
parser.add_argument("source",
metavar="SOURCE",
help="Source file formatted as: identifier/file")
parser.add_argument("destination",
metavar="DESTINATION",
help="Destination file formatted as: identifier/file")
# Options
parser.add_argument("-m", "--metadata",
metavar="KEY:VALUE",
nargs="+",
action=MetadataAction,
help=("Metadata to add to your new item, "
"if you are moving the file to a new item"))
parser.add_argument("-H", "--header",
metavar="KEY:VALUE",
nargs="+",
action=QueryStringAction,
default={},
help="S3 HTTP headers to send with your request")
parser.add_argument("--replace-metadata",
action="store_true",
help=("Only use metadata specified as argument, do not copy any "
"from the source item"))
parser.add_argument("--ignore-file-metadata",
action="store_true",
help="Do not copy file metadata")
parser.add_argument("-n", "--no-derive",
action="store_true",
help="Do not derive uploaded files")
parser.add_argument("--no-backup",
action="store_true",
help=("Turn off archive.org backups, "
"clobbered files will not be saved to 'history/files/$key.~N~'"))
parser.set_defaults(func=lambda args: main(args, parser))
def main(args: argparse.Namespace, parser: argparse.ArgumentParser) -> None:
"""
Main entry point for ia move command.
"""
# Add keep-old-version by default.
if not args.header.get("x-archive-keep-old-version") and not args.no_backup:
args.header["x-archive-keep-old-version"] = "1"
# Call ia_copy.
_, src_file = ia_copy.main(args, cmd="move", parser=parser)
if src_file:
dr = src_file.delete(headers=args.header, cascade_delete=True)
else:
print(f"error: {src_file} does not exist", file=sys.stderr)
sys.exit(1)
if dr.status_code == 204:
print(f"success: moved '{args.source}' to '{args.destination}'", file=sys.stderr)
sys.exit(0)
print(f"error: {dr.content}", file=sys.stderr)


@@ -0,0 +1,130 @@
"""
ia_reviews.py
'ia' subcommand for listing, submitting, and deleting reviews for archive.org items.
"""
# Copyright (C) 2012-2024 Internet Archive
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import argparse
import sys
from requests.exceptions import HTTPError
def setup(subparsers):
"""
Setup args for reviews command.
Args:
subparsers: subparser object passed from ia.py
"""
parser = subparsers.add_parser("reviews",
aliases=["re"],
help="submit and modify reviews for archive.org items")
# Positional arguments
parser.add_argument("identifier",
type=str,
help="identifier of the item")
# Options
parser.add_argument("-d", "--delete",
action="store_true",
help="delete your review")
parser.add_argument("-t", "--title",
type=str,
help="the title of your review")
parser.add_argument("-b", "--body",
type=str,
help="the body of your review")
parser.add_argument("-s", "--stars",
type=int,
help="the number of stars for your review")
parser.add_argument("-i", "--index",
action="store_true",
help="Index a review")
parser.add_argument("-n", "--noindex",
action="store_true",
help="Remove a review from the index")
# Conditional arguments that require --delete
delete_group = parser.add_argument_group("delete options",
("these options are used with "
"the --delete flag"))
delete_group.add_argument("-u", "--username",
type=str,
help="delete reviews for a specific user given USERNAME")
delete_group.add_argument("-S", "--screenname",
type=str,
help="delete reviews for a specific user given SCREENNAME")
delete_group.add_argument("-I", "--itemname",
type=str,
help="delete reviews for a specific user given ITEMNAME")
parser.set_defaults(func=lambda args: main(args, parser))
def main(args: argparse.Namespace, parser: argparse.ArgumentParser) -> None:
"""
Main entry point for 'ia reviews'.
"""
item = args.session.get_item(args.identifier)
if args.index:
r = item.index_review(username=args.username,
screenname=args.screenname,
itemname=args.itemname)
if r.json().get("success"):
print(f"{item.identifier} - success: review indexed", file=sys.stderr)
sys.exit(0)
elif args.noindex:
r = item.noindex_review(username=args.username,
screenname=args.screenname,
itemname=args.itemname)
if r.json().get("success"):
print(f"{item.identifier} - success: review removed from index", file=sys.stderr)
sys.exit(0)
if args.delete:
r = item.delete_review(username=args.username,
screenname=args.screenname,
itemname=args.itemname)
elif not args.body and not args.title:
try:
r = item.get_review()
print(r.text)
sys.exit(0)
except HTTPError as exc:
if exc.response.status_code == 404: # type: ignore
sys.exit(0)
else:
raise exc
else:
if (args.title and not args.body) or (args.body and not args.title):
parser.error("both --title and --body must be provided")
r = item.review(args.title, args.body, args.stars)
j = r.json()
if j.get("success") or "no change detected" in j.get("error", "").lower():
task_id = j.get("value", {}).get("task_id")
if task_id:
print((f"{item.identifier} - success: "
f"https://catalogd.archive.org/log/{task_id}"),
file=sys.stderr)
else:
print(f"{item.identifier} - warning: no changes detected!", file=sys.stderr)
sys.exit(0)
else:
print(f"{item.identifier} - error: {j.get('error')}", file=sys.stderr)
sys.exit(1)
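# Illustrative usage (identifier and review text are hypothetical):
#
#   ia reviews my-item                                # print your review, if any
#   ia reviews my-item -t 'Great scan' -b 'Clean OCR.' -s 5
#   ia reviews my-item --delete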


@@ -0,0 +1,202 @@
"""
ia_search.py
'ia' subcommand for searching items on archive.org.
"""
# Copyright (C) 2012-2024 Internet Archive
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations
import argparse
import sys
from itertools import chain
from requests.exceptions import ConnectTimeout, ReadTimeout
from internetarchive.cli.cli_utils import FlattenListAction, QueryStringAction
from internetarchive.exceptions import AuthenticationError
from internetarchive.utils import json
def setup(subparsers):
"""
Setup args for search command.
Args:
subparsers: subparser object passed from ia.py
"""
parser = subparsers.add_parser("search",
aliases=["se"],
help="Search items on archive.org")
# Positional arguments
parser.add_argument("query",
type=str,
help="Search query or queries.")
# Optional arguments
parser.add_argument("-p", "--parameters",
nargs="+",
action=QueryStringAction,
metavar="KEY:VALUE",
help="Parameters to send with your query.")
parser.add_argument("-H", "--header",
nargs="+",
action=QueryStringAction,
metavar="KEY:VALUE",
help="Add custom headers to your search request.")
parser.add_argument("-s", "--sort",
action="append",
help=("Sort search results by specified fields. "
"See https://archive.org/advancedsearch.php "
"for full list of sort values"
" (e.g. --sort 'date desc', --sort 'date asc', etc.)."))
parser.add_argument("-i", "--itemlist",
action="store_true",
help="Output identifiers only.")
parser.add_argument("-f", "--field",
nargs="+",
action=FlattenListAction,
help="Metadata fields to return.")
parser.add_argument("-n", "--num-found",
action="store_true",
help="Print the number of results to stdout.")
parser.add_argument("-F", "--fts",
action="store_true",
help="Beta support for querying the archive.org full text search API.")
parser.add_argument("-D", "--dsl-fts",
action="store_true",
help="Submit --fts query in dsl.")
parser.add_argument("-t", "--timeout",
type=float,
default=300,
help="Set the timeout in seconds.")
parser.set_defaults(func=lambda args: main(args, parser))
def prepare_values(value):
"""
Prepare comma-separated values based on the input value.
"""
if value:
return list(chain.from_iterable([x.split(",") for x in value]))
return None
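# A quick illustration with hypothetical values: prepare_values(["date asc"])
# returns ["date asc"] unchanged, while prepare_values(["identifier,title", "date"])
# flattens the comma-separated entry to ["identifier", "title", "date"].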
def perform_search(args, fields, sorts, r_kwargs):
"""
Perform the search using the provided arguments and request kwargs.
"""
return args.session.search_items(args.query, # type: ignore
fields=fields,
sorts=sorts,
params=args.parameters,
full_text_search=args.fts,
dsl_fts=args.dsl_fts,
request_kwargs=r_kwargs)
def handle_search_results(args, search):
"""
Handle search results based on command-line arguments.
"""
if args.num_found:
print(search.num_found)
sys.exit(0)
for result in search:
if args.itemlist:
if args.fts or args.dsl_fts:
print("\n".join(result.get("fields", {}).get("identifier")))
else:
print(result.get("identifier", ""))
else:
print(json.dumps(result))
if result.get("error"):
sys.exit(1)
def handle_value_error(exc):
"""
Handle ValueError exception.
"""
return f"error: {exc}"
def handle_connect_timeout():
"""
Handle ConnectTimeout exception.
"""
return "error: Request timed out. Increase the --timeout and try again."
def handle_read_timeout():
"""
Handle ReadTimeout exception.
"""
return "error: The server timed out and failed to return all search results, please try again"
def handle_authentication_error(exc):
"""
Handle AuthenticationError exception.
"""
return f"error: {exc}"
def main(args: argparse.Namespace, parser: argparse.ArgumentParser) -> None:
"""
Main entry point for 'ia search'.
"""
try:
# Prepare fields and sorts.
fields = prepare_values(args.field)
sorts = prepare_values(args.sort)
# Prepare request kwargs.
r_kwargs = {
"headers": args.header,
"timeout": args.timeout,
}
# Perform search.
search = perform_search(args, fields, sorts, r_kwargs)
# Handle search results.
handle_search_results(args, search)
except ValueError as exc:
error_message = handle_value_error(exc)
print(error_message, file=sys.stderr)
sys.exit(1)
except ConnectTimeout:
error_message = handle_connect_timeout()
print(error_message, file=sys.stderr)
sys.exit(1)
except ReadTimeout:
error_message = handle_read_timeout()
print(error_message, file=sys.stderr)
sys.exit(1)
except AuthenticationError as exc:
error_message = handle_authentication_error(exc)
print(error_message, file=sys.stderr)
sys.exit(1)
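# Illustrative usage (queries and fields are hypothetical):
#
#   ia search 'collection:nasa' --itemlist
#   ia search 'subject:maps' -f identifier,title -s 'date desc'
#   ia search 'collection:nasa' --num-found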


@@ -0,0 +1,146 @@
"""
ia_simplelists.py
'ia' subcommand for managing simplelists on archive.org.
"""
# Copyright (C) 2012-2025 Internet Archive
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations
import argparse
import sys
from internetarchive.utils import json
def setup(subparsers):
"""Set up argument parser for the 'simplelists' subcommand.
Args:
subparsers: argparse subparsers object from main CLI
"""
parser = subparsers.add_parser("simplelists",
aliases=["sl"],
help="Manage simplelists")
parser.add_argument(
"identifier",
nargs="?",
type=str,
help="Identifier for the upload"
)
group = parser.add_argument_group("List operations")
group.add_argument(
"-p", "--list-parents",
action="store_true",
help="List parent lists for the given identifier"
)
group.add_argument(
"-c", "--list-children",
action="store_true",
help="List children in parent list"
)
group.add_argument(
"-l", "--list-name",
type=str,
help="Name of the list to operate on"
)
group = parser.add_argument_group("Modification operations")
group.add_argument(
"-s", "--set-parent",
metavar="PARENT",
type=str,
help="Add identifier to specified parent list"
)
group.add_argument(
"-n", "--notes",
metavar="NOTES",
type=str,
help="Notes to attach to the list membership"
)
group.add_argument(
"-r", "--remove-parent",
metavar="PARENT",
type=str,
help="Remove identifier from specified parent list"
)
parser.set_defaults(func=lambda args: main(args, parser))
def submit_patch(patch, args):
"""Submit patch request to simplelists API"""
data = {"-patch": json.dumps(patch), "-target": "simplelists"}
url = f"{args.session.protocol}//{args.session.host}/metadata/{args.identifier}"
return args.session.post(url, data=data)
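# For illustration (hypothetical values), a "set" patch assembled by
# _handle_patch_operation() below is posted as form data like:
#
#   {"-patch": '{"op": "set", "parent": "my-parent-list", "list": "catchall"}',
#    "-target": "simplelists"}
#
# against the /metadata/<identifier> endpoint of the configured host.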
def _handle_patch_operation(args, parser, operation):
"""Handle set/delete patch operations for simplelists.
:param operation: The patch operation type ('set' or 'delete')
"""
if not args.identifier:
parser.error("Missing required identifier argument")
if not args.list_name:
parser.error("Must specify list name with -l/--list-name")
patch = {
"op": operation,
"parent": args.set_parent or args.remove_parent,
"list": args.list_name,
}
if args.notes:
patch["notes"] = args.notes
r = submit_patch(patch, args)
try:
r.raise_for_status()
print(f"success: {args.identifier}")
except Exception as e:
print(f"error: {args.identifier} - {e!s}", file=sys.stderr)
sys.exit(1)
def main(args: argparse.Namespace, parser: argparse.ArgumentParser) -> None:
"""Handle simplelists subcommand execution.
Args:
args: Parsed command-line arguments
parser: Argument parser for error handling
"""
if args.list_parents:
item = args.session.get_item(args.identifier)
simplelists = item.item_metadata.get("simplelists")
if simplelists:
print(json.dumps(simplelists))
elif args.list_children:
args.list_name = args.list_name or "catchall"
query = f"simplelists__{args.list_name}:{args.identifier or '*'}"
for result in args.session.search_items(query):
print(json.dumps(result))
elif args.set_parent:
_handle_patch_operation(args, parser, "set")
elif args.remove_parent:
_handle_patch_operation(args, parser, "delete")
else:
parser.print_help()
sys.exit(1)
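# Illustrative usage (identifiers and list names are hypothetical):
#
#   ia simplelists my-item --list-parents
#   ia simplelists my-parent-item --list-children -l catchall
#   ia simplelists my-item --set-parent my-parent-item -l catchall -n 'a note'
#   ia simplelists my-item --remove-parent my-parent-item -l catchall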


@@ -0,0 +1,177 @@
"""
ia_tasks.py
'ia' subcommand for retrieving information about archive.org catalog tasks.
"""
# Copyright (C) 2012-2024 Internet Archive
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import argparse
import sys
import warnings
from internetarchive.cli.cli_utils import PostDataAction, QueryStringAction
from internetarchive.utils import json
def setup(subparsers):
"""
Setup args for tasks command.
Args:
subparsers: subparser object passed from ia.py
"""
parser = subparsers.add_parser("tasks",
aliases=["ta"],
help="Retrieve information about your archive.org catalog tasks")
parser.add_argument("-t", "--task",
nargs="*",
help="Return information about the given task.")
parser.add_argument("-G", "--get-task-log",
help="Return the given tasks task log.")
parser.add_argument("-p", "--parameter",
nargs="+",
action=QueryStringAction,
default={},
metavar="KEY:VALUE",
help="URL parameters passed to catalog.php.")
parser.add_argument("-T", "--tab-output",
action="store_true",
help="Output task info in tab-delimited columns.")
parser.add_argument("-c", "--cmd",
type=str,
help="The task to submit (e.g., make_dark.php).")
parser.add_argument("-C", "--comment",
type=str,
help="A reasonable explanation for why a task is being submitted.")
parser.add_argument("-a", "--task-args",
nargs="+",
action=QueryStringAction,
default={},
metavar="KEY:VALUE",
help="Args to submit to the Tasks API.")
parser.add_argument("-d", "--data",
nargs="+",
action=PostDataAction,
metavar="KEY:VALUE",
default={},
help="Additional data to send when submitting a task.")
parser.add_argument("-r", "--reduced-priority",
action="store_true",
help="Submit task at a reduced priority.")
parser.add_argument("-l", "--get-rate-limit",
action="store_true",
help="Get rate limit info.")
parser.add_argument("identifier",
type=str,
nargs="?",
help="Identifier for tasks specific operations.")
parser.set_defaults(func=lambda args: main(args, parser))
def handle_task_submission_result(result, cmd):
"""
Handle the result of a task submission.
"""
if result.get("success"):
task_log_url = result.get("value", {}).get("log")
print(f"success: {task_log_url}", file=sys.stderr)
elif "already queued/running" in result.get("error", ""):
print(f"success: {cmd} task already queued/running", file=sys.stderr)
    else:
        print(f"error: {result.get('error')}", file=sys.stderr)
        sys.exit(1)
def main(args: argparse.Namespace, parser: argparse.ArgumentParser) -> None:
"""
Main entry point for 'ia tasks'.
"""
# Tasks write API.
if args.cmd:
if args.get_rate_limit:
r = args.session.get_tasks_api_rate_limit(args.cmd)
print(json.dumps(r))
sys.exit(0)
args.data["args"] = args.task_args
r = args.session.submit_task(args.identifier,
args.cmd,
comment=args.comment,
priority=int(args.data.get("priority", 0)),
reduced_priority=args.reduced_priority,
data=args.data)
handle_task_submission_result(r.json(), args.cmd)
sys.exit(0)
# Tasks read API.
if args.identifier:
_params = {"identifier": args.identifier, "catalog": 1, "history": 1}
_params.update(args.parameter)
args.parameter = _params
elif args.get_task_log:
log = args.session.get_task_log(args.get_task_log, **args.parameter)
print(log.encode("utf-8", errors="surrogateescape")
.decode("utf-8", errors="replace"))
sys.exit(0)
queryable_params = [
"identifier",
"task_id",
"server",
"cmd",
"args",
"submitter",
"priority",
"wait_admin",
"submittime",
]
if not (args.identifier
or args.parameter.get("task_id")):
_params = {"catalog": 1, "history": 0}
_params.update(args.parameter)
args.parameter = _params
if not any(x in args.parameter for x in queryable_params):
_params = {"submitter": args.session.user_email, "catalog": 1, "history": 0, "summary": 0}
_params.update(args.parameter)
args.parameter = _params
if args.tab_output:
warn_msg = ("tab-delimited output will be removed in a future release. "
"Please switch to the default JSON output.")
warnings.warn(warn_msg, stacklevel=2)
for t in args.session.get_tasks(params=args.parameter):
# Legacy support for tab-delimited output.
# Mypy is confused by CatalogTask members being created from kwargs
if args.tab_output:
color = t.color if t.color else "done"
task_args = "\t".join([f"{k}={v}" for k, v in t.args.items()]) # type: ignore
output = "\t".join([str(x) for x in [
t.identifier,
t.task_id,
t.server,
t.submittime,
t.cmd,
color,
t.submitter,
task_args,
] if x])
print(output, flush=True)
else:
print(t.json(), flush=True)
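# Illustrative usage (identifiers and task IDs are hypothetical):
#
#   ia tasks my-item                          # catalog and history for one item
#   ia tasks -p 'task_id:123456789'           # query a specific task
#   ia tasks -G 123456789                     # fetch a task log
#   ia tasks my-item -c make_dark.php -C 'detailed reason'   # submit a task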


@@ -0,0 +1,376 @@
"""
ia_upload.py
'ia' subcommand for uploading files to archive.org.
"""
# Copyright (C) 2012-2024 Internet Archive
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import argparse
import csv
import os
import sys
import webbrowser
from copy import deepcopy
from locale import getpreferredencoding
from tempfile import TemporaryFile
from typing import Union
from requests.exceptions import HTTPError
from internetarchive.cli.cli_utils import (
MetadataAction,
QueryStringAction,
get_args_dict,
validate_identifier,
)
from internetarchive.utils import (
InvalidIdentifierException,
JSONDecodeError,
is_valid_metadata_key,
json,
)
def setup(subparsers):
"""
    Setup args for upload command.
Args:
subparsers: subparser object passed from ia.py
"""
parser = subparsers.add_parser("upload",
aliases=["up"],
help="Upload files to archive.org")
# Positional arguments
parser.add_argument("identifier",
type=validate_identifier,
nargs="?",
default=None,
help="Identifier for the upload")
parser.add_argument("file",
nargs="*",
type=validate_file,
help="File(s) to upload")
# Options
parser.add_argument("-q", "--quiet",
action="store_true",
help="Turn off ia's output")
parser.add_argument("-d", "--debug",
action="store_true",
help=("Print S3 request parameters to stdout and exit without "
"sending request"))
parser.add_argument("-r", "--remote-name",
help=("When uploading data from stdin, "
"this option sets the remote filename"))
parser.add_argument("-m", "--metadata",
nargs="+",
action=MetadataAction,
metavar="KEY:VALUE",
default={},
help="Metadata to add to your item")
parser.add_argument("--spreadsheet",
type=argparse.FileType("r", encoding="utf-8-sig"),
help="Bulk uploading")
parser.add_argument("--file-metadata",
type=argparse.FileType("r"),
help="Upload files with file-level metadata via a file_md.jsonl file")
parser.add_argument("-H", "--header",
nargs="+",
action=QueryStringAction,
default={},
help="S3 HTTP headers to send with your request")
parser.add_argument("-c", "--checksum",
action="store_true",
help="Skip based on checksum")
parser.add_argument("-v", "--verify",
action="store_true",
help="Verify that data was not corrupted traversing the network")
parser.add_argument("-n", "--no-derive",
action="store_true",
help="Do not derive uploaded files")
parser.add_argument("--size-hint",
help="Specify a size-hint for your item")
parser.add_argument("--delete",
action="store_true",
help="Delete files after verifying checksums")
parser.add_argument("-R", "--retries",
type=int,
help="Number of times to retry request if S3 returns a 503 SlowDown error")
parser.add_argument("-s", "--sleep",
type=int,
help="The amount of time to sleep between retries")
parser.add_argument("--no-collection-check",
action="store_true",
help="Skip collection exists check")
parser.add_argument("-o", "--open-after-upload",
action="store_true",
help="Open the details page for an item after upload")
parser.add_argument("--no-backup",
action="store_true",
help="Turn off archive.org backups")
parser.add_argument("--keep-directories",
action="store_true",
help="Keep directories in the supplied file paths for the remote filename")
parser.add_argument("--no-scanner",
action="store_true",
help="Do not set the scanner field in meta.xml")
parser.add_argument("--status-check",
action="store_true",
help="Check if S3 is accepting requests to the given item")
parser.set_defaults(func=lambda args: main(args, parser))
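# Illustrative usage (identifiers, paths, and metadata are hypothetical):
#
#   ia upload my-item file1.pdf file2.pdf -m title:'My Item' -m mediatype:texts
#   ia upload my-item - --remote-name=data.txt < data.txt
#   ia upload --spreadsheet=uploads.csv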
def _upload_files(item, files, upload_kwargs, prev_identifier=None):
"""
Helper function for calling :meth:`Item.upload`
"""
# Check if the list has any element.
if not files:
raise FileNotFoundError("No valid file was found. Check your paths.")
responses = []
if (upload_kwargs["verbose"]) and (prev_identifier != item.identifier):
print(f"{item.identifier}:", file=sys.stderr)
try:
response = item.upload(files, **upload_kwargs)
responses += response
except HTTPError as exc:
responses += [exc.response]
except InvalidIdentifierException as exc:
print(str(exc), file=sys.stderr)
sys.exit(1)
finally:
# Debug mode.
if upload_kwargs["debug"]:
for i, r in enumerate(responses):
if i != 0:
print("---", file=sys.stderr)
headers = "\n".join(
[f" {k}:{v}" for (k, v) in r.headers.items()]
)
print(f"Endpoint:\n {r.url}\n", file=sys.stderr)
print(f"HTTP Headers:\n{headers}", file=sys.stderr)
return responses
def uploading_from_stdin(args):
"""
Check if the user is uploading from stdin.
"""
if not args.file:
return False
elif len(args.file) == 1 and args.file[0] == "-":
return True
return False
def check_if_file_arg_required(args, parser):
required_if_no_file = [args.spreadsheet, args.file_metadata, args.status_check]
if not args.file and not any(required_if_no_file):
parser.error("You must specify a file to upload.")
def validate_file(arg):
if os.path.exists(arg) or arg == "-":
return arg
else:
raise argparse.ArgumentTypeError(f"'{arg}' is not a valid file or directory")
def main(args, parser): # noqa: PLR0912,C901
# TODO: Refactor to deal with PLR0912 and C901
# add type hints
"""
Main entry point for 'ia upload'.
"""
check_if_file_arg_required(args, parser)
if uploading_from_stdin(args) and not args.remote_name:
parser.error("When uploading from stdin, "
"you must specify a remote filename with --remote-name")
if args.status_check: # TODO: support for checking if a specific bucket is overloaded
if args.session.s3_is_overloaded():
print(f"warning: {args.identifier} is over limit, and not accepting requests. "
"Expect 503 SlowDown errors.",
file=sys.stderr)
sys.exit(1)
else:
print(f"success: {args.identifier} is accepting requests.", file=sys.stderr)
sys.exit(0)
elif args.identifier:
item = args.session.get_item(args.identifier)
# Prepare upload headers and kwargs
    queue_derive = not args.no_derive
    verbose = not args.quiet
    set_scanner = not args.no_scanner
if args.size_hint:
args.header["x-archive-size-hint"] = args.size_hint
if not args.header.get("x-archive-keep-old-version") \
and not args.no_backup:
args.header["x-archive-keep-old-version"] = "1"
if args.file_metadata:
try:
with open(args.file_metadata) as fh:
                args.file = json.load(fh)
except JSONDecodeError:
args.file = []
with open(args.file_metadata) as fh:
for line in fh:
j = json.loads(line.strip())
args.file.append(j)
upload_kwargs = {
"metadata": args.metadata,
"headers": args.header,
"debug": args.debug,
"queue_derive": queue_derive,
"set_scanner": set_scanner,
"verbose": verbose,
"verify": args.verify,
"checksum": args.checksum,
"retries": args.retries,
"retries_sleep": args.sleep,
"delete": args.delete,
"validate_identifier": True,
}
# Upload files
errors = False
if not args.spreadsheet:
if uploading_from_stdin(args):
local_file = TemporaryFile()
# sys.stdin normally has the buffer attribute which returns bytes.
# However, this might not always be the case, e.g. on mocking for test purposes.
# Fall back to reading as str and encoding back to bytes.
# Note that the encoding attribute might also be None. In that case, fall back to
# locale.getpreferredencoding, the default of io.TextIOWrapper and open().
if hasattr(sys.stdin, "buffer"):
def read():
return sys.stdin.buffer.read(1048576)
else:
encoding = sys.stdin.encoding or getpreferredencoding(False)
def read():
return sys.stdin.read(1048576).encode(encoding)
while True:
data = read()
if not data:
break
local_file.write(data)
local_file.seek(0)
else:
local_file = args.file
# Properly expand a period to the contents of the current working directory.
        if isinstance(local_file, (list, tuple)) and "." in local_file:
local_file = [p for p in local_file if p != "."]
local_file = os.listdir(".") + local_file
if isinstance(local_file, (list, tuple, set)) and args.remote_name:
local_file = local_file[0]
if args.remote_name:
files = {args.remote_name: local_file}
elif args.keep_directories:
files = {f: f for f in local_file}
else:
files = local_file
for _r in _upload_files(item, files, upload_kwargs):
if args.debug:
break
# Check if Response is empty first (i.e. --checksum)
# TODO: Should upload return something other than an empty Response
# object if checksum is set and the file is already in the item?
if _r.status_code is None:
pass
elif not _r.ok:
errors = True
else:
if args.open_after_upload:
url = f"{args.session.protocol}//{args.session.host}/details/{item.identifier}"
webbrowser.open_new_tab(url)
# Bulk upload using spreadsheet.
else:
# Use the same session for each upload request.
with args.spreadsheet as csvfp:
spreadsheet = csv.DictReader(csvfp)
prev_identifier = None
for row in spreadsheet:
for metadata_key in row:
if not is_valid_metadata_key(metadata_key):
print(f"error: '{metadata_key}' is not a valid metadata key.",
file=sys.stderr)
sys.exit(1)
upload_kwargs_copy = deepcopy(upload_kwargs)
if row.get("REMOTE_NAME"):
local_file = {row["REMOTE_NAME"]: row["file"]}
del row["REMOTE_NAME"]
elif args.keep_directories:
local_file = {row["file"]: row["file"]}
else:
local_file = row["file"]
identifier = row.get("item", row.get("identifier"))
if not identifier:
if not prev_identifier:
print("error: no identifier column on spreadsheet.",
file=sys.stderr)
sys.exit(1)
identifier = prev_identifier
del row["file"]
if "identifier" in row:
del row["identifier"]
if "item" in row:
del row["item"]
item = args.session.get_item(identifier)
# TODO: Clean up how indexed metadata items are coerced
# into metadata.
md_args = [f"{k.lower()}:{v}" for (k, v) in row.items() if v]
metadata = get_args_dict(md_args)
upload_kwargs_copy["metadata"].update(metadata)
r = _upload_files(item, local_file, upload_kwargs_copy, prev_identifier)
for _r in r:
if args.debug:
break
if (not _r.status_code) or (not _r.ok):
errors = True
else:
if args.open_after_upload:
url = (f"{args.session.protocol}//{args.session.host}"
"/details/{identifier}")
webbrowser.open_new_tab(url)
prev_identifier = identifier
if errors:
sys.exit(1)
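# An illustrative spreadsheet for --spreadsheet bulk uploads (hypothetical
# values). The "identifier" (or "item") column names the target item, "file"
# the local path, and the optional "REMOTE_NAME" column the remote filename;
# every other non-empty column is treated as item metadata. A row with a blank
# identifier reuses the identifier from the previous row:
#
#   identifier,file,title,REMOTE_NAME
#   my-item,/path/to/scan.pdf,My Title,renamed.pdf
#   ,/path/to/scan2.pdf,Another Title,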