#cli #usage #waybacktweets h3,
-.sphinxsidebarwrapper li ul li ul:has(a[href="#waybacktweets"]):last-child{
+#cli .admonition-title,
+.sphinxsidebarwrapper li ul li ul:has(a[href="#waybacktweets"]):last-child {
display: none;
}
.. click:: waybacktweets.cli.main:cli
:prog: waybacktweets
:nested: full
+
+Collapsing
+------------
+
+The Wayback Tweets command line tool recommends collapsing on one of three fields: ``urlkey``, ``digest``, or ``timestamp``.
+
+- ``urlkey``: (`str`) A canonical transformation of the URL you supplied, for example, ``org,eserver,tc)/``. Such keys are useful for indexing.
+
+- ``digest``: (`str`) The ``SHA1`` hash digest of the content, excluding the headers. It's usually a base-32-encoded string.
+
+- ``timestamp``: (`datetime`) A 14-digit date-time representation in the ``YYYYMMDDhhmmss`` format. We recommend collapsing on ``YYYYMMDD``, i.e. the first 8 digits.
+
+However, any other field can be used as well. The text below is extracted from the official Wayback CDX Server API (Beta) documentation; a usage sketch follows the note.
+
+.. note::
+
+ A new form of filtering is the option to "collapse" results based on a field, or a substring of a field. Collapsing is done on adjacent CDX lines where all captures after the first one that are duplicate are filtered out. This is useful for filtering out captures that are "too dense" or when looking for unique captures.
+
+ To use collapsing, add one or more ``collapse=field`` or ``collapse=field:N`` where ``N`` is the first ``N`` characters of field to test.
+
+ - Ex: Only show at most 1 capture per hour (compare the first 10 digits of the ``timestamp`` field). Given 2 captures ``20130226010000`` and ``20130226010800``, since first 10 digits ``2013022601`` match, the 2nd capture will be filtered out:
+
+ http://web.archive.org/cdx/search/cdx?url=google.com&collapse=timestamp:10
+
+ The calendar page at `web.archive.org` uses this filter by default: `http://web.archive.org/web/*/archive.org`
+
+    - Ex: Only show unique captures by ``digest`` (note that only adjacent digests are collapsed; duplicates elsewhere in the cdx are not affected):
+
+ http://web.archive.org/cdx/search/cdx?url=archive.org&collapse=digest
+
+ - Ex: Only show unique urls in a prefix query (filtering out captures except first capture of a given url). This is similar to the old prefix query in wayback (note: this query may be slow at the moment):
+
+ http://web.archive.org/cdx/search/cdx?url=archive.org&collapse=urlkey&matchType=prefix
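+
+As a usage sketch, here is how a collapse value could be consumed through the Python API. The ``WaybackTweets`` constructor arguments mirror the CLI code in this changeset; the import path is an assumption, so verify it against the package before copying.
+
+.. code-block:: python
+
+    # Assumed import path; the class name and argument order come from the
+    # CLI code in this changeset.
+    from waybacktweets import WaybackTweets
+
+    # collapse="timestamp:8" keeps at most one capture per day, because the
+    # first 8 digits of YYYYMMDDhhmmss are YYYYMMDD.
+    api = WaybackTweets(
+        "username",      # Twitter handle, without the @
+        "timestamp:8",   # collapse
+        "20150101",      # timestamp_from
+        "20191231",      # timestamp_to
+        100,             # limit
+        None,            # offset
+    )
+    archived_tweets = api.get()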
-|uncheck| JSON Issue: Create a separate function to handle JSON return, apply JsonParser (``waybacktweets/api/parse_tweets.py:73``), and avoid rate limiting.
+|uncheck| Code: Create a separate function to handle the JSON return, apply JsonParser (``waybacktweets/api/parse_tweets.py:73``), and avoid rate limiting
+
+|uncheck| Docs: Add a tutorial on how to save tweets via the command line
+
+|uncheck| Web App: Return complete JSON when mimetype is ``application/json``
import re
from concurrent.futures import ThreadPoolExecutor, as_completed
+from contextlib import nullcontext
from typing import Any, Dict, List, Optional, Tuple
from urllib.parse import unquote
self._add_field("archived_digest", response[5])
self._add_field("archived_length", response[6])
- def parse(self) -> Dict[str, List[Any]]:
+    def parse(self, print_progress: bool = False) -> Dict[str, List[Any]]:
"""
Parses the archived tweets CDX data and structures it.
+        :param print_progress: Whether to display a progress bar while parsing.
:returns: The parsed tweets data.
"""
with ThreadPoolExecutor(max_workers=10) as executor:
            futures = {
                executor.submit(self._process_response, response): response
for response in self.archived_tweets_response[1:]
}
- with Progress() as progress:
- task = progress.add_task(
- f"Waybacking @{self.username} tweets\n", total=len(futures)
- )
+
+ progress_context = Progress() if print_progress else nullcontext()
+ with progress_context as progress:
+ task = None
+ if print_progress:
+ task = progress.add_task(
+ f"Waybacking @{self.username} tweets\n", total=len(futures)
+ )
for future in as_completed(futures):
try:
except Exception as e:
rprint(f"[red]{e}")
- progress.update(task, advance=1)
+ if print_progress:
+ progress.update(task, advance=1)
return self.parsed_tweets
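
# The optional-progress pattern used in parse() above, in isolation -- a
# minimal sketch with illustrative names, not part of this patch. It relies
# on contextlib.nullcontext() yielding None, so the same `with` statement
# works whether or not a rich Progress bar is active.
from contextlib import nullcontext

from rich.progress import Progress


def process_items(items, print_progress=False):
    progress_context = Progress() if print_progress else nullcontext()
    with progress_context as progress:
        task = None
        if print_progress:
            task = progress.add_task("Processing items", total=len(items))
        for item in items:
            ...  # handle one item here
            if print_progress:
                progress.update(task, advance=1)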
if self.offset:
params["offset"] = self.offset
- print("Making a request to the Internet Archive...")
-
try:
response = get_response(url=url, params=params)
api = WaybackTweets(
username, collapse, timestamp_from, timestamp_to, limit, offset
)
+
+ print("Making a request to the Internet Archive...")
archived_tweets = api.get()
if archived_tweets:
]
parser = TweetsParser(archived_tweets, username, field_options)
- parsed_tweets = parser.parse()
+ parsed_tweets = parser.parse(print_progress=True)
exporter = TweetsExporter(parsed_tweets, username, field_options)
rprint(f"[red]{e}")
finally:
rprint(
- "[yellow]\nNeed help? Open an issue: https://github.com/claromes/waybacktweets/issues" # noqa: E501
+ "[yellow]\nNeed help? Read the docs: https://claromes.github.io/waybacktweets" # noqa: E501
)