add rk
authorClaromes <clarissamendes@alunos.utfpr.edu.br>
Tue, 14 Jan 2025 12:20:42 +0000 (09:20 -0300)
committerClaromes <clarissamendes@alunos.utfpr.edu.br>
Tue, 14 Jan 2025 12:20:42 +0000 (09:20 -0300)
docs/streamlit.rst
waybacktweets/_cli.py
waybacktweets/api/parse.py
waybacktweets/api/request.py
waybacktweets/config/field_options.py

index 1c5f25acbe96be8aefec0311f19180c0efe9e348..bf748e2f0792ea9f760433fa2b979eac7beb9fef 100644 (file)
@@ -13,7 +13,7 @@ Filters
 
 - Limit: Query result limits.
 
-- Offset: Allows for a simple way to scroll through the results.
+- Resumption Key: Allows for a simple way to scroll through the results. Enter the key returned by the previous query to continue from where that query ended.
 
 - Only unique Wayback Machine URLs: Filtering by the collapse option using the ``urlkey`` field and the URL Match Scope ``prefix``
 
index f003efcd23658eacffde0a1da22afed6c418b0e9..231d243b1d0fd283d4d33331bdf17234865d4562 100644 (file)
@@ -77,12 +77,11 @@ def _parse_date(
     help="Query result limits.",
 )
 @click.option(
-    "-o",
-    "--offset",
-    type=int,
-    metavar="INTEGER",
+    "-rk",
+    "--resumption_key",
+    type=str,
     default=None,
-    help="Allows for a simple way to scroll through the results.",
+    help="Allows for a simple way to scroll through the results. Key to continue the query from the end of the previous query.",  # noqa: E501
 )
 @click.option(
     "-mt",
@@ -105,7 +104,7 @@ def main(
     timestamp_from: Optional[str],
     timestamp_to: Optional[str],
     limit: Optional[int],
-    offset: Optional[int],
+    resumption_key: Optional[str],
     matchtype: Optional[str],
     verbose: Optional[bool],
 ) -> None:
@@ -118,7 +117,13 @@ def main(
         config.verbose = verbose
 
         api = WaybackTweets(
-            username, collapse, timestamp_from, timestamp_to, limit, offset, matchtype
+            username,
+            collapse,
+            timestamp_from,
+            timestamp_to,
+            limit,
+            resumption_key,
+            matchtype,
         )
 
         print(f"Waybacking @{username}'s archived tweets...")
@@ -140,6 +145,7 @@ def main(
                 "archived_statuscode",
                 "archived_digest",
                 "archived_length",
+                "resumption_key",
             ]
 
             parser = TweetsParser(archived_tweets, username, field_options)
index 31e0e3430b10b62e137ebf62ad9fe937f3c2a482..eefa605a4393e539b9986b41b5de7e9d04323e0a 100644 (file)
@@ -178,6 +178,33 @@ class TweetsParser:
         self.field_options = field_options
         self.parsed_tweets = {option: [] for option in self.field_options}
 
+        if "resumption_key" not in self.parsed_tweets:
+            self.parsed_tweets["resumption_key"] = []
+
+        self._add_resumption_key()
+
+    def _add_resumption_key(self):
+        """Adds the resumption key from the last archived tweet response to the parsed tweets.
+
+        Reads the first element of the last item in ``self.archived_tweets_response``, appends it
+        to the 'resumption_key' list in ``self.parsed_tweets``, and prints it together with a hint
+        on passing it to the "resumption_key" option when the "limit" option is used.
+        NOTE(review): presumably the last item only holds a resume key when more results remain — confirm.
+
+        Raises:
+            ValueError: If the list of archived tweet responses is empty.
+
+        """  # noqa: E501
+        if not self.archived_tweets_response:
+            raise ValueError("The list of archived tweet responses is empty.")
+
+        resumption_key = self.archived_tweets_response[-1][0]
+        self.parsed_tweets["resumption_key"].append(resumption_key)
+
+        rprint(
+            f'[blue]\nResumption Key: [bold]{resumption_key}[/bold]\nIf you are using the "limit" option, use this key in the "resumption_key" option and continue the query from the end of the previous query.\n'  # noqa: E501
+        )
+
     def _add_field(self, key: str, value: Any) -> None:
         """
         Appends a value to a list in the parsed data structure.
@@ -286,6 +313,8 @@ class TweetsParser:
                 for future in as_completed(futures):
                     try:
                         future.result()
+                    except IndexError:
+                        pass
                     except Exception as e:
                         rprint(f"[red]{e}")
 
index 3797fb4403d7e0473d4cd0949d51fdaf6839ea73..a503566e964c5e895e0c0e51c96b261b140ed866 100644 (file)
@@ -27,7 +27,7 @@ class WaybackTweets:
         timestamp_from (str, optional): The timestamp to start retrieving tweets from.
         timestamp_to (str, optional): The timestamp to stop retrieving tweets at.
         limit (int, optional): The maximum number of results to return.
-        offset (int, optional): The number of lines to skip in the results.
+        resumption_key (str, optional): Key to continue the query from the end of the previous query.
         matchtype (str, optional): Results matching a certain prefix, a certain host or all subdomains.
     """  # noqa: E501
 
@@ -38,7 +38,7 @@ class WaybackTweets:
         timestamp_from: str = None,
         timestamp_to: str = None,
         limit: int = None,
-        offset: int = None,
+        resumption_key: str = None,
         matchtype: str = None,
     ):
         self.username = username
@@ -46,7 +46,7 @@ class WaybackTweets:
         self.timestamp_from = timestamp_from
         self.timestamp_to = timestamp_to
         self.limit = limit
-        self.offset = offset
+        self.resumption_key = resumption_key
         self.matchtype = matchtype
 
     def get(self) -> Optional[Dict[str, Any]]:
@@ -64,6 +64,7 @@ class WaybackTweets:
 
         params = {
             "url": f"https://twitter.com/{self.username}/status{wildcard_pathname}",
+            "showResumeKey": "true",
             "output": "json",
         }
 
@@ -79,8 +80,8 @@ class WaybackTweets:
         if self.limit:
             params["limit"] = self.limit
 
-        if self.offset:
-            params["offset"] = self.offset
+        if self.resumption_key:
+            params["resumption_key"] = self.resumption_key
 
         if self.matchtype:
             params["matchType"] = self.matchtype
index 1d36f031c6f85f9e7af901846e7e53cdadaaa193..3cd2e4b5c552fddabcf1277bf2237e999edfe2d6 100644 (file)
@@ -17,4 +17,5 @@ FIELD_OPTIONS = [
     "archived_statuscode",
     "archived_digest",
     "archived_length",
+    "resumption_key",
 ]