Post-Local sync at 2025-06-23T22:46:07Z
This commit is contained in:
parent
9d33b42020
commit
9f97801b0d
1387 changed files with 250216 additions and 117 deletions
466 .venv/lib/python3.12/site-packages/internetarchive/utils.py Normal file
@@ -0,0 +1,466 @@
#
# The internetarchive module is a Python/CLI interface to Archive.org.
#
# Copyright (C) 2012-2024 Internet Archive
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""
internetarchive.utils
~~~~~~~~~~~~~~~~~~~~~

This module provides utility functions for the internetarchive library.

:copyright: (C) 2012-2024 by Internet Archive.
:license: AGPL 3, see LICENSE for more details.
"""
from __future__ import annotations

import hashlib
import os
import re
import sys
from collections.abc import Mapping
from typing import Iterable
from xml.dom.minidom import parseString

# Make preferred JSON package available via `from internetarchive.utils import json`
try:
    import ujson as json

    # ujson lacks a JSONDecodeError: https://github.com/ultrajson/ultrajson/issues/497
    JSONDecodeError = ValueError
except ImportError:
    import json  # type: ignore
    JSONDecodeError = json.JSONDecodeError  # type: ignore
def deep_update(d: dict, u: Mapping) -> dict:
    for k, v in u.items():
        if isinstance(v, Mapping):
            r = deep_update(d.get(k, {}), v)
            d[k] = r
        else:
            d[k] = u[k]
    return d
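# Example sketch of how deep_update differs from dict.update: nested
# mappings are merged key-by-key instead of being replaced wholesale.
#
#     >>> d = {'metadata': {'title': 'foo', 'year': '2024'}}
#     >>> deep_update(d, {'metadata': {'title': 'bar'}})
#     {'metadata': {'title': 'bar', 'year': '2024'}}
#
# A plain d.update(...) would have dropped the 'year' key.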
class InvalidIdentifierException(Exception):
    pass
def validate_s3_identifier(string: str) -> bool:
    legal_chars = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789._-'
    # Periods, underscores, and dashes are legal, but may not be the first
    # character!
    if any(string.startswith(c) is True for c in ['.', '_', '-']):
        raise InvalidIdentifierException('Identifier cannot begin with periods ".", underscores '
                                         '"_", or dashes "-".')

    if len(string) > 100 or len(string) < 3:
        raise InvalidIdentifierException('Identifier should be between 3 and 100 characters in '
                                         'length.')

    # Support for uploading to user items, e.g. first character can be `@`.
    if string.startswith('@'):
        string = string[1:]

    if any(c not in legal_chars for c in string):
        raise InvalidIdentifierException('Identifier can only contain alphanumeric characters, '
                                         'periods ".", underscores "_", or dashes "-". However, '
                                         'identifier cannot begin with periods, underscores, or '
                                         'dashes.')

    return True
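# Example sketch: valid identifiers return True; invalid ones raise
# InvalidIdentifierException rather than returning False.
#
#     >>> validate_s3_identifier('my-item_2024')
#     True
#     >>> validate_s3_identifier('@user-item')  # user items may start with '@'
#     True
#     >>> validate_s3_identifier('-bad')  # doctest: +IGNORE_EXCEPTION_DETAIL
#     Traceback (most recent call last):
#     InvalidIdentifierException: ...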
def needs_quote(s: str) -> bool:
    try:
        s.encode('ascii')
    except (UnicodeDecodeError, UnicodeEncodeError):
        return True
    return re.search(r'\s', s) is not None
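# Example sketch: a value needs quoting if it is non-ASCII or contains
# whitespace.
#
#     >>> needs_quote('résumé.txt')
#     True
#     >>> needs_quote('foo bar')
#     True
#     >>> needs_quote('foo')
#     False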
def norm_filepath(fp: bytes | str) -> str:
    if isinstance(fp, bytes):
        fp = fp.decode('utf-8')
    fp = fp.replace(os.path.sep, '/')
    if not fp.startswith('/'):
        fp = f'/{fp}'
    return fp
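# Example sketch (output shown for POSIX, where os.path.sep is '/'):
#
#     >>> norm_filepath(b'foo/bar.txt')
#     '/foo/bar.txt'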
def get_md5(file_object) -> str:
    m = hashlib.md5()
    while True:
        data = file_object.read(8192)
        if not data:
            break
        m.update(data)
    file_object.seek(0, os.SEEK_SET)
    return m.hexdigest()
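# Example sketch: hashes in 8 KiB chunks, then rewinds the handle so the
# caller can read the file again afterwards.
#
#     >>> import io
#     >>> fh = io.BytesIO(b'hello')
#     >>> get_md5(fh)
#     '5d41402abc4b2a76b9719d911017c592'
#     >>> fh.tell()  # position was reset
#     0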
def chunk_generator(fp, chunk_size: int):
    while True:
        chunk = fp.read(chunk_size)
        if not chunk:
            break
        yield chunk
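# Example sketch:
#
#     >>> import io
#     >>> list(chunk_generator(io.BytesIO(b'abcdef'), 4))
#     [b'abcd', b'ef']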
def suppress_keyboard_interrupt_message() -> None:
    """Register a new excepthook to suppress KeyboardInterrupt
    exception messages, and exit with status code 130.
    """
    old_excepthook = sys.excepthook

    def new_hook(type, value, traceback):
        if type is KeyboardInterrupt:
            sys.exit(130)
        old_excepthook(type, value, traceback)

    sys.excepthook = new_hook
class IterableToFileAdapter:
    def __init__(self, iterable, size: int, pre_encode: bool = False):
        self.iterator = iter(iterable)
        self.length = size
        # pre_encode is needed because http doesn't know that it needs to
        # encode a TextIO object when it's wrapped in the Iterator from tqdm.
        # So, this FileAdapter provides pre-encoded output.
        self.pre_encode = pre_encode

    def read(self, size: int = -1):  # TBD: add buffer for `len(data) > size` case
        if self.pre_encode:
            # This adapter is intended to emulate the encoding that is usually
            # done by the http lib. As of 2022, iso-8859-1 encoding is used to
            # meet the HTTP standard; see Lib/http/client.py in the cpython
            # repo (https://github.com/python/cpython), lines 246 and 1340,
            # or grep for 'iso-8859-1'.
            return next(self.iterator, '').encode("iso-8859-1")
        return next(self.iterator, b'')

    def __len__(self) -> int:
        return self.length
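# Example sketch: wrapping a chunk iterator so it can be passed where a
# file-like body is expected; __len__ lets callers size the upload.
#
#     >>> import io
#     >>> body = b'some data'
#     >>> a = IterableToFileAdapter(chunk_generator(io.BytesIO(body), 4), len(body))
#     >>> a.read(), a.read(), a.read(), a.read()
#     (b'some', b' dat', b'a', b'')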
class IdentifierListAsItems:
    """This class is a lazily-loaded list of Items, accessible by index or identifier."""

    def __init__(self, id_list_or_single_id, session):
        self.ids = (id_list_or_single_id
                    if isinstance(id_list_or_single_id, list)
                    else [id_list_or_single_id])
        self._items = [None] * len(self.ids)
        self.session = session

    def __len__(self) -> int:
        return len(self.ids)

    def __getitem__(self, idx):
        for i in (range(*idx.indices(len(self))) if isinstance(idx, slice) else [idx]):
            if self._items[i] is None:
                self._items[i] = self.session.get_item(self.ids[i])
        return self._items[idx]

    def __getattr__(self, name):
        try:
            return self[self.ids.index(name)]
        except ValueError:
            raise AttributeError

    def __repr__(self) -> str:
        return f'{self.__class__.__name__}({self.ids!r})'
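# Example sketch (`session` is assumed to be an ArchiveSession-like object
# with a `get_item` method): Items are fetched lazily, on first access,
# either by index or by identifier-as-attribute.
#
#     items = IdentifierListAsItems(['nasa', 'stairs'], session)
#     items[0]      # first access triggers session.get_item('nasa')
#     items.nasa    # same cached Item, looked up by identifier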
def get_s3_xml_text(xml_str: str) -> str:
    def _get_tag_text(tag_name, xml_obj):
        text = ''
        elements = xml_obj.getElementsByTagName(tag_name)
        for e in elements:
            for node in e.childNodes:
                if node.nodeType == node.TEXT_NODE:
                    text += node.data
        return text

    try:
        p = parseString(xml_str)
        _msg = _get_tag_text('Message', p)
        _resource = _get_tag_text('Resource', p)
        # Avoid weird Resource text that contains PUT method.
        if _resource and "'PUT" not in _resource:
            return f'{_msg} - {_resource.strip()}'
        else:
            return _msg
    except Exception:
        return str(xml_str)
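# Example sketch: condensing an IA-S3 error response into a single line.
#
#     >>> err = ('<Error><Code>NoSuchBucket</Code>'
#     ...        '<Message>The specified bucket does not exist.</Message>'
#     ...        '<Resource>does-not-exist</Resource></Error>')
#     >>> get_s3_xml_text(err)
#     'The specified bucket does not exist. - does-not-exist'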
def get_file_size(file_obj) -> int | None:
    if is_filelike_obj(file_obj):
        try:
            file_obj.seek(0, os.SEEK_END)
            size = file_obj.tell()
            # Avoid OverflowError.
            if size > sys.maxsize:
                size = None
            file_obj.seek(0, os.SEEK_SET)
        except OSError:
            size = None
    else:
        st = os.stat(file_obj)
        size = st.st_size
    return size
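# Example sketch: file-like objects are measured by seeking to the end;
# paths fall through to os.stat.
#
#     >>> import io
#     >>> get_file_size(io.BytesIO(b'12345'))
#     5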
def iter_directory(directory: str):
    """Given a directory, yield all files recursively as a two-tuple (filepath, s3key)"""
    for path, _dir, files in os.walk(directory):
        for f in files:
            filepath = os.path.join(path, f)
            key = os.path.relpath(filepath, directory)
            yield (filepath, key)
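# Example sketch: for a hypothetical tree docs/a.txt and docs/img/b.png,
# iter_directory('docs') would yield ('docs/a.txt', 'a.txt') and
# ('docs/img/b.png', 'img/b.png').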
def recursive_file_count_and_size(files, item=None, checksum=False):
    """Given a filepath or list of filepaths, return the total number and size of files.
    If `checksum` is `True`, skip over files whose MD5 hash matches any file in the `item`.
    """
    if not isinstance(files, (list, set)):
        files = [files]
    total_files = 0
    total_size = 0
    if checksum is True:
        md5s = [f.get('md5') for f in item.files]
    else:
        md5s = []
    if isinstance(files, dict):
        # Make sure to use local filenames.
        _files = files.values()
    else:
        if isinstance(files[0], tuple):
            _files = dict(files).values()
        else:
            _files = files
    for f in _files:
        try:
            is_dir = os.path.isdir(f)
        except TypeError:
            try:
                f = f[0]
                is_dir = os.path.isdir(f)
            except (AttributeError, TypeError):
                is_dir = False
        if is_dir:
            it = iter_directory(f)
        else:
            it = [(f, None)]
        for x, _ in it:
            if checksum is True:
                try:
                    with open(x, 'rb') as fh:
                        lmd5 = get_md5(fh)
                except TypeError:
                    # Support file-like objects.
                    lmd5 = get_md5(x)
                if lmd5 in md5s:
                    continue
            total_size += get_file_size(x)
            total_files += 1
    return total_files, total_size
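# Example sketch (hypothetical paths): directories are walked recursively,
# plain files are counted as-is.
#
#     total_files, total_size = recursive_file_count_and_size(['notes.txt', 'photos/'])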
def recursive_file_count(*args, **kwargs):
    """Like `recursive_file_count_and_size`, but returns only the file count."""
    total_files, _ = recursive_file_count_and_size(*args, **kwargs)
    return total_files
def is_dir(obj) -> bool:
    """Special is_dir function to handle file-like object cases that
    cannot be stat'd"""
    try:
        return os.path.isdir(obj)
    except TypeError:
        return False
def is_filelike_obj(obj) -> bool:
    """Distinguish file-like from path-like objects"""
    try:
        os.fspath(obj)
    except TypeError:
        return True
    else:
        return False
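# Example sketch: anything os.fspath() rejects is treated as file-like.
#
#     >>> import io
#     >>> is_filelike_obj('notes.txt')
#     False
#     >>> is_filelike_obj(io.BytesIO(b'data'))
#     True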
def reraise_modify(
    caught_exc: Exception,
    append_msg: str,
    prepend: bool = False,
) -> None:
    """Append message to exception while preserving attributes.

    Preserves exception class and exception traceback.

    Note:
        This function needs to be called inside an except block because an
        exception must be active in the current scope.

    Args:
        caught_exc (Exception): The caught exception object
        append_msg (str): The message to append to the caught exception
        prepend (bool): If True, prepend the message to args instead of appending

    Returns:
        None

    Side Effects:
        Re-raises the exception with the preserved data / trace but
        modified message
    """
    if not caught_exc.args:
        # If no args, create our own tuple.
        arg_list = [append_msg]
    else:
        # If the last arg is a string, append (or prepend) the message to it;
        # otherwise keep it as-is and add the message as a new arg.
        arg_list = list(caught_exc.args[:-1])
        last_arg = caught_exc.args[-1]
        if isinstance(last_arg, str):
            if prepend:
                arg_list.append(append_msg + last_arg)
            else:
                arg_list.append(last_arg + append_msg)
        else:
            arg_list += [last_arg, append_msg]
    caught_exc.args = tuple(arg_list)
    raise  # noqa: PLE0704
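# Example sketch: must run inside an except block, since the bare `raise`
# re-raises the currently active exception.
#
#     >>> try:
#     ...     int('nope')
#     ... except ValueError as exc:
#     ...     reraise_modify(exc, ' (while parsing config)')
#     Traceback (most recent call last):
#     ValueError: invalid literal for int() with base 10: 'nope' (while parsing config)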
def remove_none(obj):
    if isinstance(obj, (list, tuple, set)):
        lst = type(obj)(remove_none(x) for x in obj if x)
        try:
            return [dict(t) for t in {tuple(sorted(d.items())) for d in lst}]
        except (AttributeError, TypeError):
            return lst
    elif isinstance(obj, dict):
        return type(obj)((remove_none(k), remove_none(v))
                         for k, v in obj.items() if k is not None and v is not None)
    else:
        return obj
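# Example sketch: None-valued entries are pruned recursively (and lists of
# dicts are de-duplicated by the set-of-sorted-items pass above).
#
#     >>> remove_none({'a': 1, 'b': None, 'c': {'d': None, 'e': 2}})
#     {'a': 1, 'c': {'e': 2}}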
def delete_items_from_dict(d: dict | list, to_delete):
    """Recursively deletes items from a dict,
    if the item's value(s) is in ``to_delete``.
    """
    if not isinstance(to_delete, list):
        to_delete = [to_delete]
    if isinstance(d, dict):
        for single_to_delete in set(to_delete):
            if single_to_delete in d.values():
                for k, v in d.copy().items():
                    if v == single_to_delete:
                        del d[k]
        for v in d.values():
            delete_items_from_dict(v, to_delete)
    elif isinstance(d, list):
        for i in d:
            delete_items_from_dict(i, to_delete)
    return remove_none(d)
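# Example sketch: every key whose value matches is deleted, at any depth.
#
#     >>> delete_items_from_dict({'a': 'REMOVE', 'b': {'c': 'REMOVE', 'd': 1}}, 'REMOVE')
#     {'b': {'d': 1}}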
def is_valid_metadata_key(name: str) -> bool:
    # According to the documentation, a metadata key has to be a valid
    # XML tag name.
    #
    # The actual allowed tag names (at least as tested with the metadata API)
    # are far more restrictive and only allow ".-A-Za-z_", possibly followed
    # by an index in square brackets, e.g. [0].
    # On the other hand, the Archive allows tags starting with the string "xml".
    return bool(re.fullmatch(r'[A-Za-z][.\-0-9A-Za-z_]+(?:\[[0-9]+\])?', name))
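# Example sketch of what the pattern accepts:
#
#     >>> is_valid_metadata_key('title')
#     True
#     >>> is_valid_metadata_key('subject[0]')
#     True
#     >>> is_valid_metadata_key('1title')  # must start with a letter
#     False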
def merge_dictionaries(
    dict0: dict | None,
    dict1: dict | None,
    keys_to_drop: Iterable | None = None,
) -> dict:
    """Merge two dictionaries.

    Items in `dict0` can optionally be dropped before the merge.

    If equal keys exist in both dictionaries,
    entries in `dict0` are overwritten.

    :param dict0: A base dictionary with the bulk of the items.

    :param dict1: Additional items which overwrite the items in `dict0`.

    :param keys_to_drop: An iterable of keys to drop from `dict0` before the merge.

    :returns: A merged dictionary.
    """
    if dict0 is not None:
        new_dict = dict0.copy()
    else:
        new_dict = {}

    if keys_to_drop is not None:
        for key in keys_to_drop:
            new_dict.pop(key, None)

    # Items from `dict1` take precedence over items from `dict0`.
    if dict1 is not None:
        new_dict.update(dict1)

    return new_dict
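# Example sketch: keys_to_drop are removed from dict0 first, and dict1 wins
# on conflicts.
#
#     >>> merge_dictionaries({'a': 1, 'b': 2, 'c': 3}, {'b': 20}, keys_to_drop=['c'])
#     {'a': 1, 'b': 20}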
def parse_dict_cookies(value: str) -> dict[str, str | None]:
    result: dict[str, str | None] = {}
    for item in value.split(';'):
        item = item.strip()
        if not item:
            continue
        if '=' not in item:
            result[item] = None
            continue
        name, value = item.split('=', 1)
        result[name] = value
    if 'domain' not in result:
        result['domain'] = '.archive.org'
    if 'path' not in result:
        result['path'] = '/'
    return result
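# Example sketch: a Set-Cookie-style string becomes a dict, with archive.org
# defaults filled in for a missing domain or path.
#
#     >>> parse_dict_cookies('logged-in-user=foo%40example.com; HttpOnly')
#     {'logged-in-user': 'foo%40example.com', 'HttpOnly': None, 'domain': '.archive.org', 'path': '/'}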
def is_valid_email(email):
    # Regular expression pattern for a valid email address.
    # Ensures the TLD has at least 2 characters.
    pattern = r'^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z]{2,}$'
    return re.match(pattern, email) is not None
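# Example sketch:
#
#     >>> is_valid_email('info@archive.org')
#     True
#     >>> is_valid_email('user@localhost')  # no TLD
#     False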