From 88ef5e2c472f4b55713d65abec7646ecf2869c15 Mon Sep 17 00:00:00 2001 From: Claromes Date: Fri, 14 Jun 2024 19:05:28 -0300 Subject: [PATCH] add print option and review cli doc --- docs/_static/css/custom.css | 3 ++- docs/cli.rst | 33 +++++++++++++++++++++++++++++ docs/todo.rst | 6 +++++- waybacktweets/api/parse_tweets.py | 19 +++++++++++------ waybacktweets/api/request_tweets.py | 2 -- waybacktweets/cli/main.py | 6 ++++-- 6 files changed, 57 insertions(+), 12 deletions(-) diff --git a/docs/_static/css/custom.css b/docs/_static/css/custom.css index e2cc030..6429ee5 100644 --- a/docs/_static/css/custom.css +++ b/docs/_static/css/custom.css @@ -1,4 +1,5 @@ #cli #usage #waybacktweets h3, -.sphinxsidebarwrapper li ul li ul:has(a[href="#waybacktweets"]):last-child{ +#cli .admonition-title, +.sphinxsidebarwrapper li ul li ul:has(a[href="#waybacktweets"]):last-child { display: none; } diff --git a/docs/cli.rst b/docs/cli.rst index 9276250..49abef2 100644 --- a/docs/cli.rst +++ b/docs/cli.rst @@ -7,3 +7,36 @@ Usage .. click:: waybacktweets.cli.main:cli :prog: waybacktweets :nested: full + +Collapsing +------------ + +The Wayback Tweets command line tool recommends the use of three types of "collapse": the ``urlkey``, ``digest``, and ``timestamp`` fields. + +- ``urlkey``: (`str`) A canonical transformation of the URL you supplied, for example, ``org,eserver,tc)/``. Such keys are useful for indexing. + +- ``digest``: (`str`) The ``SHA1`` hash digest of the content, excluding the headers. It's usually a base-32-encoded string. + +- ``timestamp``: (`datetime`) A 14 digit date-time representation in the ``YYYYMMDDhhmmss`` format. We recommend ``YYYYMMDD``. + +However, it is possible to use other collapse options as well. Read the text below, extracted from the official Wayback CDX Server API (Beta) documentation. + +.. note:: + + A new form of filtering is the option to "collapse" results based on a field, or a substring of a field.
Collapsing is done on adjacent CDX lines where all captures after the first one that are duplicate are filtered out. This is useful for filtering out captures that are "too dense" or when looking for unique captures. + + To use collapsing, add one or more ``collapse=field`` or ``collapse=field:N`` where ``N`` is the first ``N`` characters of field to test. + + - Ex: Only show at most 1 capture per hour (compare the first 10 digits of the ``timestamp`` field). Given 2 captures ``20130226010000`` and ``20130226010800``, since first 10 digits ``2013022601`` match, the 2nd capture will be filtered out: + + http://web.archive.org/cdx/search/cdx?url=google.com&collapse=timestamp:10 + + The calendar page at `web.archive.org` uses this filter by default: `http://web.archive.org/web/*/archive.org` + + - Ex: Only show unique captures by ``digest`` (note that only adjacent digest are collapsed, duplicates elsewhere in the cdx are not affected): + + http://web.archive.org/cdx/search/cdx?url=archive.org&collapse=digest + + - Ex: Only show unique urls in a prefix query (filtering out captures except first capture of a given url). This is similar to the old prefix query in wayback (note: this query may be slow at the moment): + + http://web.archive.org/cdx/search/cdx?url=archive.org&collapse=urlkey&matchType=prefix diff --git a/docs/todo.rst b/docs/todo.rst index 4b2caa1..58d1feb 100644 --- a/docs/todo.rst +++ b/docs/todo.rst @@ -5,5 +5,9 @@ TODO -|uncheck| JSON Issue: Create a separate function to handle JSON return, apply JsonParser (``waybacktweets/api/parse_tweets.py:73``), and avoid rate limiting. 
+|uncheck| Code: JSON Issue: Create a separate function to handle JSON return, apply JsonParser (``waybacktweets/api/parse_tweets.py:73``), and avoid rate limiting + +|uncheck| Docs: Add tutorial on how to save Tweet via command line + +|uncheck| Web App: Return complete JSON when mimetype is ``application/json`` diff --git a/waybacktweets/api/parse_tweets.py b/waybacktweets/api/parse_tweets.py index 5c8adcb..3912590 100644 --- a/waybacktweets/api/parse_tweets.py +++ b/waybacktweets/api/parse_tweets.py @@ -1,5 +1,6 @@ import re from concurrent.futures import ThreadPoolExecutor, as_completed +from contextlib import nullcontext from typing import Any, Dict, List, Optional, Tuple from urllib.parse import unquote @@ -231,10 +232,11 @@ class TweetsParser: self._add_field("archived_digest", response[5]) self._add_field("archived_length", response[6]) - def parse(self) -> Dict[str, List[Any]]: + def parse(self, print_progress=False) -> Dict[str, List[Any]]: """ Parses the archived tweets CDX data and structures it. + :param print_progress: A boolean indicating whether to print progress or not. :returns: The parsed tweets data. 
""" with ThreadPoolExecutor(max_workers=10) as executor: @@ -243,10 +245,14 @@ class TweetsParser: executor.submit(self._process_response, response): response for response in self.archived_tweets_response[1:] } - with Progress() as progress: - task = progress.add_task( - f"Waybacking @{self.username} tweets\n", total=len(futures) - ) + + progress_context = Progress() if print_progress else nullcontext() + with progress_context as progress: + task = None + if print_progress: + task = progress.add_task( + f"Waybacking @{self.username} tweets\n", total=len(futures) + ) for future in as_completed(futures): try: @@ -254,6 +260,7 @@ class TweetsParser: except Exception as e: rprint(f"[red]{e}") - progress.update(task, advance=1) + if print_progress: + progress.update(task, advance=1) return self.parsed_tweets diff --git a/waybacktweets/api/request_tweets.py b/waybacktweets/api/request_tweets.py index baebe3a..07e9903 100644 --- a/waybacktweets/api/request_tweets.py +++ b/waybacktweets/api/request_tweets.py @@ -62,8 +62,6 @@ class WaybackTweets: if self.offset: params["offset"] = self.offset - print("Making a request to the Internet Archive...") - try: response = get_response(url=url, params=params) diff --git a/waybacktweets/cli/main.py b/waybacktweets/cli/main.py index 421980d..96509c5 100644 --- a/waybacktweets/cli/main.py +++ b/waybacktweets/cli/main.py @@ -65,6 +65,8 @@ def cli( api = WaybackTweets( username, collapse, timestamp_from, timestamp_to, limit, offset ) + + print("Making a request to the Internet Archive...") archived_tweets = api.get() if archived_tweets: @@ -86,7 +88,7 @@ def cli( ] parser = TweetsParser(archived_tweets, username, field_options) - parsed_tweets = parser.parse() + parsed_tweets = parser.parse(print_progress=True) exporter = TweetsExporter(parsed_tweets, username, field_options) @@ -97,5 +99,5 @@ def cli( rprint(f"[red]{e}") finally: rprint( - "[yellow]\nNeed help? 
Open an issue: https://github.com/claromes/waybacktweets/issues" # noqa: E501 + "[yellow]\nNeed help? Read the docs: https://claromes.github.io/waybacktweets" # noqa: E501 ) -- 2.34.1