Post-Local sync at 2025-06-23T22:46:07Z
This commit is contained in:
parent 9d33b42020
commit 9f97801b0d
1387 changed files with 250216 additions and 117 deletions
.venv/lib/python3.12/site-packages/internetarchive/search.py (new file, 304 lines)
@@ -0,0 +1,304 @@
#
# The internetarchive module is a Python/CLI interface to Archive.org.
#
# Copyright (C) 2012-2024 Internet Archive
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""
internetarchive.search
~~~~~~~~~~~~~~~~~~~~~~

This module provides objects for interacting with the Archive.org
search engine.

:copyright: (C) 2012-2024 by Internet Archive.
:license: AGPL 3, see LICENSE for more details.
"""
import itertools
from logging import getLogger

from requests.exceptions import ReadTimeout

from internetarchive.auth import S3Auth

log = getLogger(__name__)

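For context, the usual entry point to this module is the `search_items()` helper exported by the `internetarchive` package, which builds an `ArchiveSession` for you and returns a `Search` like the class defined below. A minimal usage sketch (the query string and field names are just examples):

# Sketch: querying via the package-level helper. Assumes either a
# configured 'ia' credentials file or that anonymous access suffices.
from internetarchive import search_items

search = search_items('collection:nasa', fields=['identifier', 'title'])
for result in search:
    # Each result is a dict containing the requested fields.
    print(result['identifier'])
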
class Search:
    """This class represents an archive.org item search. You can use
    this class to search for Archive.org items using the advanced search
    engine.

    Usage::

        >>> from internetarchive.session import ArchiveSession
        >>> from internetarchive.search import Search
        >>> s = ArchiveSession()
        >>> search = Search(s, '(uploader:jake@archive.org)')
        >>> for result in search:
        ...     print(result['identifier'])
    """

    def __init__(self, archive_session, query,
                 fields=None,
                 sorts=None,
                 params=None,
                 full_text_search=None,
                 dsl_fts=None,
                 request_kwargs=None,
                 max_retries=None):
        params = params or {}

        self.session = archive_session
        self.dsl_fts = False if not dsl_fts else True
        if self.dsl_fts or full_text_search:
            self.fts = True
        else:
            self.fts = False
        self.query = query
        if self.fts and not self.dsl_fts:
            self.query = f'!L {self.query}'
        self.fields = fields or []
        self.sorts = sorts or []
        self.request_kwargs = request_kwargs or {}
        self._num_found = None
        self.fts_url = f'{self.session.protocol}//be-api.us.archive.org/ia-pub-fts-api'
        self.scrape_url = f'{self.session.protocol}//{self.session.host}/services/search/v1/scrape'
        self.search_url = f'{self.session.protocol}//{self.session.host}/advancedsearch.php'
        if self.session.access_key and self.session.secret_key:
            self.auth = S3Auth(self.session.access_key, self.session.secret_key)
        else:
            self.auth = None
        self.max_retries = max_retries if max_retries is not None else 5

        # Initialize params.
        default_params = {'q': self.query}
        if 'page' not in params:
            if 'rows' in params:
                params['page'] = 1
            else:
                default_params['count'] = 10000
        else:
            default_params['output'] = 'json'
        # In the beta endpoint 'scope' was called 'index'.
        # Let's support both for a while.
        if 'index' in params:
            params['scope'] = params['index']
            del params['index']
        self.params = default_params.copy()
        self.params.update(params)

        # Set timeout.
        if 'timeout' not in self.request_kwargs:
            self.request_kwargs['timeout'] = 300

        # Set retries.
        self.session.mount_http_adapter(max_retries=self.max_retries)
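
The constructor arguments map directly onto the eventual request: `fields` and `sorts` become `fl[i]`/`sort[i]` parameters for advanced search, or comma-joined `fields`/`sorts` for the scrape endpoint, and putting `page` or `rows` into `params` opts into the paged advanced-search endpoint instead of the default scrape endpoint. A sketch of a paged query (query, field, and sort values here are illustrative, not prescribed):

# Sketch: a paged advanced-search query. Because 'rows' is present,
# __init__ adds page=1, so _advanced_search() is used rather than
# the scrape endpoint.
from internetarchive.session import ArchiveSession
from internetarchive.search import Search

s = ArchiveSession()
search = Search(s, 'collection:nasa',
                fields=['identifier', 'title'],
                sorts=['downloads desc'],
                params={'rows': 50})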

    def __repr__(self):
        return f'Search(query={self.query!r})'

    def __iter__(self):
        return self.iter_as_results()

    def _advanced_search(self):
        # Always return identifier.
        if 'identifier' not in self.fields:
            self.fields.append('identifier')
        for k, v in enumerate(self.fields):
            self.params[f'fl[{k}]'] = v

        for i, field in enumerate(self.sorts):
            self.params[f'sort[{i}]'] = field

        self.params['output'] = 'json'

        r = self.session.get(self.search_url,
                             params=self.params,
                             auth=self.auth,
                             **self.request_kwargs)
        j = r.json()
        num_found = int(j.get('response', {}).get('numFound', 0))
        if not self._num_found:
            self._num_found = num_found
        if j.get('error'):
            yield j
        yield from j.get('response', {}).get('docs', [])

    def _scrape(self):
        if self.fields:
            self.params['fields'] = ','.join(self.fields)
        if self.sorts:
            self.params['sorts'] = ','.join(self.sorts)
        i = 0
        num_found = None
        while True:
            r = self.session.post(self.scrape_url,
                                  params=self.params,
                                  auth=self.auth,
                                  **self.request_kwargs)
            j = r.json()
            if j.get('error'):
                yield j
            if not num_found:
                num_found = int(j.get('total') or '0')
            if not self._num_found:
                self._num_found = num_found
            self._handle_scrape_error(j)

            self.params['cursor'] = j.get('cursor')
            for item in j['items']:
                i += 1
                yield item
            if 'cursor' not in j:
                if i != num_found:
                    raise ReadTimeout('The server failed to return results in the'
                                      f' allotted amount of time for {r.request.url}')
                break
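
`_scrape()` pages through the full result set with a server-issued cursor: each response carries a `cursor` token that is fed back as a request parameter until the server omits it, at which point the count of yielded items is checked against the reported total. The same pattern in the abstract (`fetch_page` is a hypothetical stand-in for `self.session.post(...).json()`, not part of this module):

# Sketch of cursor pagination: keep re-requesting with the last cursor
# until the server stops returning one.
def iter_all(fetch_page, params):
    while True:
        page = fetch_page(params)          # -> {'items': [...], 'cursor': ...?}
        yield from page['items']
        if 'cursor' not in page:           # no cursor means the last page
            break
        params['cursor'] = page['cursor']  # resume where this page ended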

    def _full_text_search(self):
        d = {
            'q': self.query,
            'size': '10000',
            'from': '0',
            'scroll': 'true',
        }

        if 'scope' in self.params:
            d['scope'] = self.params['scope']

        if 'size' in self.params:
            d['scroll'] = False
            d['size'] = self.params['size']

        while True:
            r = self.session.post(self.fts_url,
                                  json=d,
                                  auth=self.auth,
                                  **self.request_kwargs)
            j = r.json()
            scroll_id = j.get('_scroll_id')
            hits = j.get('hits', {}).get('hits')
            if not hits:
                return
            yield from hits
            if not hits or d['scroll'] is False:
                break
            d['scroll_id'] = scroll_id
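
Full-text search is selected by constructor flags rather than by `params`: `full_text_search=True` prefixes the query with `!L ` for the lucene-style endpoint, while `dsl_fts=True` sends the query untouched as DSL. A sketch (the query text is an example; judging by the `_scroll_id`/`hits` handling above, each hit is an Elasticsearch-style hit dict, so `_id` is an assumption about its keys):

# Sketch: full-text search over item contents rather than metadata.
# full_text_search=True routes __iter__ to _full_text_search().
s = ArchiveSession()
fts = Search(s, 'theory of relativity', full_text_search=True)
for hit in fts:
    print(hit.get('_id'))  # raw search-engine hit; '_id' assumed present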

    def _make_results_generator(self):
        if self.fts:
            return self._full_text_search()
        if 'user_aggs' in self.params:
            return self._user_aggs()
        elif 'page' in self.params:
            return self._advanced_search()
        else:
            return self._scrape()
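
The dispatch rule, restated: constructor flags win first, then `params` decide. In sketch form, mirroring the branches above:

# How a Search routes, given _make_results_generator():
#   Search(s, q, full_text_search=True)      -> _full_text_search()
#   Search(s, q, params={'user_aggs': ...})  -> _user_aggs()
#   Search(s, q, params={'page': 2})         -> _advanced_search()
#   Search(s, q)                             -> _scrape()  (the default)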

    def _user_aggs(self):
        """Experimental support for user aggregations.
        """
        del self.params['count']  # advanced search will error if this param is present!
        self.params['page'] = '1'
        self.params['rows'] = '1'
        self.params['output'] = 'json'
        r = self.session.get(self.search_url,
                             params=self.params,
                             auth=self.auth,
                             **self.request_kwargs)
        j = r.json()
        if j.get('error'):
            yield j
        for agg in j.get('response', {}).get('aggregations', {}).items():
            yield {agg[0]: agg[1]}
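
As the docstring says, this is experimental: aggregations are requested by putting a `user_aggs` key into `params`, and `rows`/`page` are pinned to 1 because only the aggregation buckets matter, not the docs. A guess at usage (the value format for `user_aggs` is an assumption inferred from the parameter name; it is not documented in this file):

# Sketch (assumption): asking advanced search for aggregations instead
# of docs. 'collection' is a hypothetical aggregation field.
agg_search = Search(s, 'subject:astronomy', params={'user_aggs': 'collection'})
for agg in agg_search:
    print(agg)  # one {name: bucket-data} dict per aggregation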

    @property
    def num_found(self):
        if not self._num_found:
            if not self.fts and 'page' in self.params:
                p = self.params.copy()
                p['output'] = 'json'
                r = self.session.get(self.search_url,
                                     params=p,
                                     auth=self.auth,
                                     **self.request_kwargs)
                j = r.json()
                num_found = int(j.get('response', {}).get('numFound', 0))
                if not self._num_found:
                    self._num_found = num_found
            elif not self.fts:
                p = self.params.copy()
                p['total_only'] = 'true'
                r = self.session.post(self.scrape_url,
                                      params=p,
                                      auth=self.auth,
                                      **self.request_kwargs)
                j = r.json()
                self._handle_scrape_error(j)
                self._num_found = j.get('total')
            else:
                self.params['q'] = self.query
                r = self.session.get(self.fts_url,
                                     params=self.params,
                                     auth=self.auth,
                                     **self.request_kwargs)
                j = r.json()
                self._num_found = j.get('hits', {}).get('total')
        return self._num_found
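
Because `num_found` is fetched lazily (in the scrape case via a dedicated request with `total_only=true`), calling `len()` on a `Search` can cost one extra round trip before any results are pulled:

# Sketch: len() goes through __len__ -> num_found, which may issue
# its own counting request the first time it is accessed.
search = Search(s, 'collection:nasa')
print(len(search))  # total matching items, cached after the first call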

    def _handle_scrape_error(self, j):
        if 'error' in j:
            if all(s in j['error'].lower() for s in ['invalid', 'secret']):
                if not j['error'].endswith('.'):
                    j['error'] += '.'
                raise ValueError(f"{j['error']} Try running 'ia configure' and retrying.")
            raise ValueError(j.get('error'))

    def _get_item_from_search_result(self, search_result):
        return self.session.get_item(search_result['identifier'])

    def iter_as_results(self):
        return SearchIterator(self, self._make_results_generator())

    def iter_as_items(self):
        _map = map(self._get_item_from_search_result, self._make_results_generator())
        return SearchIterator(self, _map)

    def __len__(self):
        return self.num_found

class SearchIterator:
    """This class is an iterator wrapper for search results.

    It provides access to the underlying Search, and supports
    len() (since that is known initially)."""

    def __init__(self, search, iterator):
        self.search = search
        self.iterator = iterator

    def __len__(self):
        return self.search.num_found

    def __next__(self):
        return next(self.iterator)

    def __iter__(self):
        return self

    def __repr__(self):
        return f'{self.__class__.__name__}({self.search!r}, {self.iterator!r})'
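
`SearchIterator` is what `iter_as_results()` and `iter_as_items()` hand back: iteration is delegated to the wrapped generator, while `len()` and the `.search` attribute keep the parent `Search` reachable mid-iteration. A closing sketch, reusing the `search` object from the earlier examples:

# Sketch: the iterator wrapper in use.
results = search.iter_as_results()
print(len(results))             # delegates to search.num_found
first = next(results)           # pulls one result dict from the generator
items = search.iter_as_items()  # same stream, but each element is an Item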