Post-Local sync at 2025-06-23T22:46:07Z

Mark Randall Havens 2025-06-23 17:55:02 -05:00
parent 9d33b42020
commit 9f97801b0d
1387 changed files with 250216 additions and 117 deletions


@@ -0,0 +1,304 @@
#
# The internetarchive module is a Python/CLI interface to Archive.org.
#
# Copyright (C) 2012-2024 Internet Archive
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
internetarchive.search
~~~~~~~~~~~~~~~~~~~~~~
This module provides objects for interacting with the Archive.org
search engine.
:copyright: (C) 2012-2024 by Internet Archive.
:license: AGPL 3, see LICENSE for more details.
"""
from logging import getLogger

from requests.exceptions import ReadTimeout

from internetarchive.auth import S3Auth

log = getLogger(__name__)


class Search:
    """This class represents an archive.org item search. You can use
    this class to search for Archive.org items using the advanced search
    engine.

    Usage::

        >>> from internetarchive.session import ArchiveSession
        >>> from internetarchive.search import Search
        >>> s = ArchiveSession()
        >>> search = Search(s, '(uploader:jake@archive.org)')
        >>> for result in search:
        ...     print(result['identifier'])
    """

    def __init__(self, archive_session, query,
                 fields=None,
                 sorts=None,
                 params=None,
                 full_text_search=None,
                 dsl_fts=None,
                 request_kwargs=None,
                 max_retries=None):
        params = params or {}
        self.session = archive_session
        self.dsl_fts = bool(dsl_fts)
        self.fts = bool(self.dsl_fts or full_text_search)
        self.query = query
        if self.fts and not self.dsl_fts:
            self.query = f'!L {self.query}'
        self.fields = fields or []
        self.sorts = sorts or []
        self.request_kwargs = request_kwargs or {}
        self._num_found = None
        self.fts_url = f'{self.session.protocol}//be-api.us.archive.org/ia-pub-fts-api'
        self.scrape_url = f'{self.session.protocol}//{self.session.host}/services/search/v1/scrape'
        self.search_url = f'{self.session.protocol}//{self.session.host}/advancedsearch.php'
        if self.session.access_key and self.session.secret_key:
            self.auth = S3Auth(self.session.access_key, self.session.secret_key)
        else:
            self.auth = None
        self.max_retries = max_retries if max_retries is not None else 5

        # Initialize params.
        default_params = {'q': self.query}
        if 'page' not in params:
            if 'rows' in params:
                params['page'] = 1
            else:
                default_params['count'] = 10000
        else:
            default_params['output'] = 'json'
        # In the beta endpoint 'scope' was called 'index'.
        # Let's support both for a while.
        if 'index' in params:
            params['scope'] = params['index']
            del params['index']
        self.params = default_params.copy()
        self.params.update(params)

        # Set timeout.
        if 'timeout' not in self.request_kwargs:
            self.request_kwargs['timeout'] = 300

        # Set retries.
        self.session.mount_http_adapter(max_retries=self.max_retries)
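
    # For reference, given the initialization above: with no `params`
    # argument, a scrape-style search ends up with
    #     self.params == {'q': query, 'count': 10000}
    # while passing params={'rows': 50} selects the paged advancedsearch
    # path, yielding
    #     self.params == {'q': query, 'rows': 50, 'page': 1}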

    def __repr__(self):
        return f'Search(query={self.query!r})'

    def __iter__(self):
        return self.iter_as_results()

    def _advanced_search(self):
        # Always return identifier.
        if 'identifier' not in self.fields:
            self.fields.append('identifier')
        for i, field in enumerate(self.fields):
            self.params[f'fl[{i}]'] = field
        for i, sort in enumerate(self.sorts):
            self.params[f'sort[{i}]'] = sort
        self.params['output'] = 'json'
        r = self.session.get(self.search_url,
                             params=self.params,
                             auth=self.auth,
                             **self.request_kwargs)
        j = r.json()
        num_found = int(j.get('response', {}).get('numFound', 0))
        if not self._num_found:
            self._num_found = num_found
        if j.get('error'):
            yield j
        yield from j.get('response', {}).get('docs', [])
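
    # Note: this method issues a single advancedsearch.php request and yields
    # only the page selected by self.params['page']; exhaustive iteration over
    # all results goes through _scrape() instead (see _make_results_generator).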

    def _scrape(self):
        if self.fields:
            self.params['fields'] = ','.join(self.fields)
        if self.sorts:
            self.params['sorts'] = ','.join(self.sorts)
        i = 0
        num_found = None
        while True:
            r = self.session.post(self.scrape_url,
                                  params=self.params,
                                  auth=self.auth,
                                  **self.request_kwargs)
            j = r.json()
            if j.get('error'):
                yield j
            if not num_found:
                num_found = int(j.get('total') or '0')
            if not self._num_found:
                self._num_found = num_found
            self._handle_scrape_error(j)
            self.params['cursor'] = j.get('cursor')
            for item in j['items']:
                i += 1
                yield item
            if 'cursor' not in j:
                if i != num_found:
                    raise ReadTimeout('The server failed to return results in the'
                                      f' allotted amount of time for {r.request.url}')
                break
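
    # The loop above assumes a scrape response shaped roughly like
    #     {"items": [...], "count": N, "total": M, "cursor": "..."}
    # with `cursor` absent on the final page, which is what terminates the
    # pagination.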

    def _full_text_search(self):
        d = {
            'q': self.query,
            'size': '10000',
            'from': '0',
            'scroll': 'true',
        }
        if 'scope' in self.params:
            d['scope'] = self.params['scope']
        if 'size' in self.params:
            d['scroll'] = False
            d['size'] = self.params['size']
        while True:
            r = self.session.post(self.fts_url,
                                  json=d,
                                  auth=self.auth,
                                  **self.request_kwargs)
            j = r.json()
            scroll_id = j.get('_scroll_id')
            hits = j.get('hits', {}).get('hits')
            if not hits:
                return
            yield from hits
            if d['scroll'] is False:
                break
            d['scroll_id'] = scroll_id
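
    # The FTS endpoint is paginated with an Elasticsearch-style scroll: each
    # response carries a `_scroll_id` that is echoed back as `scroll_id` on
    # the next request, until a page comes back with no hits. Passing an
    # explicit `size` param disables scrolling and fetches a single page.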

    def _make_results_generator(self):
        if self.fts:
            return self._full_text_search()
        if 'user_aggs' in self.params:
            return self._user_aggs()
        elif 'page' in self.params:
            return self._advanced_search()
        else:
            return self._scrape()

    def _user_aggs(self):
        """Experimental support for user aggregations."""
        # Advanced search errors if 'count' is present, so drop it if set.
        self.params.pop('count', None)
        self.params['page'] = '1'
        self.params['rows'] = '1'
        self.params['output'] = 'json'
        r = self.session.get(self.search_url,
                             params=self.params,
                             auth=self.auth,
                             **self.request_kwargs)
        j = r.json()
        if j.get('error'):
            yield j
        for name, agg in j.get('response', {}).get('aggregations', {}).items():
            yield {name: agg}
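
    # A hypothetical invocation (per the dispatch in _make_results_generator
    # above): aggregations are requested by passing a `user_aggs` param, e.g.
    #
    #     >>> aggs = Search(s, 'collection:nasa', params={'user_aggs': 'year'})  # doctest: +SKIP
    #
    # Each yielded mapping pairs one aggregation name with its buckets.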

    @property
    def num_found(self):
        if not self._num_found:
            if not self.fts and 'page' in self.params:
                p = self.params.copy()
                p['output'] = 'json'
                r = self.session.get(self.search_url,
                                     params=p,
                                     auth=self.auth,
                                     **self.request_kwargs)
                j = r.json()
                self._num_found = int(j.get('response', {}).get('numFound', 0))
            elif not self.fts:
                p = self.params.copy()
                p['total_only'] = 'true'
                r = self.session.post(self.scrape_url,
                                      params=p,
                                      auth=self.auth,
                                      **self.request_kwargs)
                j = r.json()
                self._handle_scrape_error(j)
                self._num_found = j.get('total')
            else:
                self.params['q'] = self.query
                r = self.session.get(self.fts_url,
                                     params=self.params,
                                     auth=self.auth,
                                     **self.request_kwargs)
                j = r.json()
                self._num_found = j.get('hits', {}).get('total')
        return self._num_found

    def _handle_scrape_error(self, j):
        if 'error' in j:
            if all(s in j['error'].lower() for s in ['invalid', 'secret']):
                if not j['error'].endswith('.'):
                    j['error'] += '.'
                raise ValueError(f"{j['error']} Try running 'ia configure' and retrying.")
            raise ValueError(j.get('error'))

    def _get_item_from_search_result(self, search_result):
        return self.session.get_item(search_result['identifier'])

    def iter_as_results(self):
        return SearchIterator(self, self._make_results_generator())

    def iter_as_items(self):
        _map = map(self._get_item_from_search_result, self._make_results_generator())
        return SearchIterator(self, _map)

    def __len__(self):
        return self.num_found


class SearchIterator:
    """This class is an iterator wrapper for search results.

    It provides access to the underlying Search, and supports
    len() (since that is known initially).
    """

    def __init__(self, search, iterator):
        self.search = search
        self.iterator = iterator

    def __len__(self):
        return self.search.num_found

    def __next__(self):
        return next(self.iterator)

    def __iter__(self):
        return self

    def __repr__(self):
        return f'{self.__class__.__name__}({self.search!r}, {self.iterator!r})'
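

# A minimal end-to-end sketch (an illustration, not part of the module),
# assuming a configured ArchiveSession:
#
#     >>> from internetarchive.session import ArchiveSession
#     >>> s = ArchiveSession()
#     >>> results = Search(s, 'collection:nasa', fields=['identifier', 'title'])
#     >>> len(results)             # total hits, via Search.num_found  # doctest: +SKIP
#     >>> for doc in results:      # doctest: +SKIP
#     ...     print(doc['identifier'])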