From: Claromes Date: Tue, 14 Jan 2025 12:20:42 +0000 (-0300) Subject: add rk X-Git-Url: https://git.claromes.com/?a=commitdiff_plain;h=704f07348b9d57455ab0ceba406f22fd883aa03b;p=waybacktweets.git add rk --- diff --git a/docs/streamlit.rst b/docs/streamlit.rst index 1c5f25a..bf748e2 100644 --- a/docs/streamlit.rst +++ b/docs/streamlit.rst @@ -13,7 +13,7 @@ Filters - Limit: Query result limits. -- Offset: Allows for a simple way to scroll through the results. +- Resumption Key: Allows for a simple way to scroll through the results. Key to continue the query from the end of the previous query. - Only unique Wayback Machine URLs: Filtering by the collapse option using the ``urlkey`` field and the URL Match Scope ``prefix`` diff --git a/waybacktweets/_cli.py b/waybacktweets/_cli.py index f003efc..231d243 100644 --- a/waybacktweets/_cli.py +++ b/waybacktweets/_cli.py @@ -77,12 +77,11 @@ def _parse_date( help="Query result limits.", ) @click.option( - "-o", - "--offset", - type=int, - metavar="INTEGER", + "-rk", + "--resumption_key", + type=str, default=None, - help="Allows for a simple way to scroll through the results.", + help="Allows for a simple way to scroll through the results. 
Key to continue the query from the end of the previous query.", # noqa: E501 ) @click.option( "-mt", @@ -105,7 +104,7 @@ def main( timestamp_from: Optional[str], timestamp_to: Optional[str], limit: Optional[int], - offset: Optional[int], + resumption_key: Optional[str], matchtype: Optional[str], verbose: Optional[bool], ) -> None: @@ -118,7 +117,13 @@ def main( config.verbose = verbose api = WaybackTweets( - username, collapse, timestamp_from, timestamp_to, limit, offset, matchtype + username, + collapse, + timestamp_from, + timestamp_to, + limit, + resumption_key, + matchtype, ) print(f"Waybacking @{username}'s archived tweets...") @@ -140,6 +145,7 @@ def main( "archived_statuscode", "archived_digest", "archived_length", + "resumption_key", ] parser = TweetsParser(archived_tweets, username, field_options) diff --git a/waybacktweets/api/parse.py b/waybacktweets/api/parse.py index 31e0e34..eefa605 100644 --- a/waybacktweets/api/parse.py +++ b/waybacktweets/api/parse.py @@ -178,6 +178,33 @@ class TweetsParser: self.field_options = field_options self.parsed_tweets = {option: [] for option in self.field_options} + if "resumption_key" not in self.parsed_tweets: + self.parsed_tweets["resumption_key"] = [] + + self._add_resumption_key() + + def _add_resumption_key(self): + """Adds the resumption key from the last archived tweet response to the parsed tweets. + + This method extracts the resumption key from the last item in the archived tweets response list + and appends it to the 'resumption_key' field in the parsed tweets dictionary. It also prints + the resumption key with instructions on how to use it with the 'limit' option for continuing + the query from the end of the previous query. + + Raises: + ValueError: If the list of archived tweet responses is empty. 
+ + """ # noqa: E501 + if not self.archived_tweets_response: + raise ValueError("The list of archived tweet responses is empty.") + + resumption_key = self.archived_tweets_response[-1][0] + self.parsed_tweets["resumption_key"].append(resumption_key) + + rprint( + f'[blue]\nResumption Key: [bold]{resumption_key}[/bold]\nIf you are using the "limit" option, use this key in the "resumption_key" option and continue the query from the end of the previous query.\n' # noqa: E501 + ) + def _add_field(self, key: str, value: Any) -> None: """ Appends a value to a list in the parsed data structure. @@ -286,6 +313,8 @@ class TweetsParser: for future in as_completed(futures): try: future.result() + except IndexError: + pass except Exception as e: rprint(f"[red]{e}") diff --git a/waybacktweets/api/request.py b/waybacktweets/api/request.py index 3797fb4..a503566 100644 --- a/waybacktweets/api/request.py +++ b/waybacktweets/api/request.py @@ -27,7 +27,7 @@ class WaybackTweets: timestamp_from (str, optional): The timestamp to start retrieving tweets from. timestamp_to (str, optional): The timestamp to stop retrieving tweets at. limit (int, optional): The maximum number of results to return. - offset (int, optional): The number of lines to skip in the results. + resumption_key (str, optional): Key to continue the query from the end of the previous query. matchtype (str, optional): Results matching a certain prefix, a certain host or all subdomains. 
""" # noqa: E501 @@ -38,7 +38,7 @@ class WaybackTweets: timestamp_from: str = None, timestamp_to: str = None, limit: int = None, - offset: int = None, + resumption_key: str = None, matchtype: str = None, ): self.username = username @@ -46,7 +46,7 @@ class WaybackTweets: self.timestamp_from = timestamp_from self.timestamp_to = timestamp_to self.limit = limit - self.offset = offset + self.resumption_key = resumption_key self.matchtype = matchtype def get(self) -> Optional[Dict[str, Any]]: @@ -64,6 +64,7 @@ class WaybackTweets: params = { "url": f"https://twitter.com/{self.username}/status{wildcard_pathname}", + "showResumeKey": "true", "output": "json", } @@ -79,8 +80,8 @@ class WaybackTweets: if self.limit: params["limit"] = self.limit - if self.offset: - params["offset"] = self.offset + if self.resumption_key: + params["resumption_key"] = self.resumption_key if self.matchtype: params["matchType"] = self.matchtype diff --git a/waybacktweets/config/field_options.py b/waybacktweets/config/field_options.py index 1d36f03..3cd2e4b 100644 --- a/waybacktweets/config/field_options.py +++ b/waybacktweets/config/field_options.py @@ -17,4 +17,5 @@ FIELD_OPTIONS = [ "archived_statuscode", "archived_digest", "archived_length", + "resumption_key", ]