add rk

author Claromes <clarissamendes@alunos.utfpr.edu.br>

Tue, 14 Jan 2025 12:20:42 +0000 (09:20 -0300)

committer Claromes <clarissamendes@alunos.utfpr.edu.br>

Tue, 14 Jan 2025 12:20:42 +0000 (09:20 -0300)
author Claromes <clarissamendes@alunos.utfpr.edu.br>
Tue, 14 Jan 2025 12:20:42 +0000 (09:20 -0300)
committer Claromes <clarissamendes@alunos.utfpr.edu.br>
Tue, 14 Jan 2025 12:20:42 +0000 (09:20 -0300)
diff --git a/docs/streamlit.rst b/docs/streamlit.rst

index 1c5f25acbe96be8aefec0311f19180c0efe9e348..bf748e2f0792ea9f760433fa2b979eac7beb9fef 100644 (file)
--- a/docs/streamlit.rst
+++ b/docs/streamlit.rst
@@ -13,7 +13,7 @@ Filters
  
  - Limit: Query result limits.
  
-- Offset: Allows for a simple way to scroll through the results.
+- Resumption Key: Key returned at the end of a query that lets the next query continue from where the previous one ended, providing a simple way to scroll through the results.
  
  - Only unique Wayback Machine URLs: Filtering by the collapse option using the ``urlkey`` field and the URL Match Scope ``prefix``
  
diff --git a/waybacktweets/_cli.py b/waybacktweets/_cli.py

index f003efcd23658eacffde0a1da22afed6c418b0e9..231d243b1d0fd283d4d33331bdf17234865d4562 100644 (file)
--- a/waybacktweets/_cli.py
+++ b/waybacktweets/_cli.py
@@ -77,12 +77,11 @@ def _parse_date(
      help="Query result limits.",
  )
  @click.option(
-    "-o",
-    "--offset",
-    type=int,
-    metavar="INTEGER",
+    "-rk",
+    "--resumption_key",
+    type=str,
      default=None,
-    help="Allows for a simple way to scroll through the results.",
+    help="Allows for a simple way to scroll through the results. Key to continue the query from the end of the previous query.",  # noqa: E501
  )
  @click.option(
      "-mt",
@@ -105,7 +104,7 @@ def main(
      timestamp_from: Optional[str],
      timestamp_to: Optional[str],
      limit: Optional[int],
-    offset: Optional[int],
+    resumption_key: Optional[str],
      matchtype: Optional[str],
      verbose: Optional[bool],
  ) -> None:
@@ -118,7 +117,13 @@ def main(
          config.verbose = verbose
  
          api = WaybackTweets(
-            username, collapse, timestamp_from, timestamp_to, limit, offset, matchtype
+            username,
+            collapse,
+            timestamp_from,
+            timestamp_to,
+            limit,
+            resumption_key,
+            matchtype,
          )
  
          print(f"Waybacking @{username}'s archived tweets...")
@@ -140,6 +145,7 @@ def main(
                  "archived_statuscode",
                  "archived_digest",
                  "archived_length",
+                "resumption_key",
              ]
  
              parser = TweetsParser(archived_tweets, username, field_options)
diff --git a/waybacktweets/api/parse.py b/waybacktweets/api/parse.py

index 31e0e3430b10b62e137ebf62ad9fe937f3c2a482..eefa605a4393e539b9986b41b5de7e9d04323e0a 100644 (file)
--- a/waybacktweets/api/parse.py
+++ b/waybacktweets/api/parse.py
@@ -178,6 +178,33 @@ class TweetsParser:
          self.field_options = field_options
          self.parsed_tweets = {option: [] for option in self.field_options}
  
+        if "resumption_key" not in self.parsed_tweets:
+            self.parsed_tweets["resumption_key"] = []
+
+        self._add_resumption_key()
+
+    def _add_resumption_key(self) -> None:
+        """Record and print the resumption key found at the end of the response.
+
+        Per the CDX server docs, with ``showResumeKey`` the key arrives as a
+        trailing single-item row; when the last row has any other shape it is
+        not a key (e.g. an ordinary capture), so nothing is recorded or printed.
+
+        Raises:
+            ValueError: If the list of archived tweet responses is empty.
+        """
+        rows = self.archived_tweets_response
+        if not rows:
+            raise ValueError("The list of archived tweet responses is empty.")
+        if len(rows[-1]) != 1:
+            return  # Last row is not a lone key: there is nothing to resume from.
+
+        resumption_key = rows[-1][0]
+        self.parsed_tweets["resumption_key"].append(resumption_key)
+        rprint(
+            f'[blue]\nResumption Key: [bold]{resumption_key}[/bold]\nIf you are using the "limit" option, use this key in the "resumption_key" option and continue the query from the end of the previous query.\n'  # noqa: E501
+        )
+
      def _add_field(self, key: str, value: Any) -> None:
          """
          Appends a value to a list in the parsed data structure.
@@ -286,6 +313,8 @@ class TweetsParser:
                  for future in as_completed(futures):
                      try:
                          future.result()
+                    except IndexError:
+                        pass
                      except Exception as e:
                          rprint(f"[red]{e}")
  
diff --git a/waybacktweets/api/request.py b/waybacktweets/api/request.py

index 3797fb4403d7e0473d4cd0949d51fdaf6839ea73..a503566e964c5e895e0c0e51c96b261b140ed866 100644 (file)
--- a/waybacktweets/api/request.py
+++ b/waybacktweets/api/request.py
@@ -27,7 +27,7 @@ class WaybackTweets:
          timestamp_from (str, optional): The timestamp to start retrieving tweets from.
          timestamp_to (str, optional): The timestamp to stop retrieving tweets at.
          limit (int, optional): The maximum number of results to return.
-        offset (int, optional): The number of lines to skip in the results.
+        resumption_key (str, optional): Key to continue the query from the end of the previous query.
          matchtype (str, optional): Results matching a certain prefix, a certain host or all subdomains.
      """  # noqa: E501
  
@@ -38,7 +38,7 @@ class WaybackTweets:
          timestamp_from: str = None,
          timestamp_to: str = None,
          limit: int = None,
-        offset: int = None,
+        resumption_key: str = None,
          matchtype: str = None,
      ):
          self.username = username
@@ -46,7 +46,7 @@ class WaybackTweets:
          self.timestamp_from = timestamp_from
          self.timestamp_to = timestamp_to
          self.limit = limit
-        self.offset = offset
+        self.resumption_key = resumption_key
          self.matchtype = matchtype
  
      def get(self) -> Optional[Dict[str, Any]]:
@@ -64,6 +64,7 @@ class WaybackTweets:
  
          params = {
              "url": f"https://twitter.com/{self.username}/status{wildcard_pathname}",
+            "showResumeKey": "true",
              "output": "json",
          }
  
@@ -79,8 +80,8 @@ class WaybackTweets:
          if self.limit:
              params["limit"] = self.limit
  
-        if self.offset:
-            params["offset"] = self.offset
+        if self.resumption_key:
+            params["resumeKey"] = self.resumption_key  # CDX server parameter name
  
          if self.matchtype:
              params["matchType"] = self.matchtype
diff --git a/waybacktweets/config/field_options.py b/waybacktweets/config/field_options.py

index 1d36f031c6f85f9e7af901846e7e53cdadaaa193..3cd2e4b5c552fddabcf1277bf2237e999edfe2d6 100644 (file)
--- a/waybacktweets/config/field_options.py
+++ b/waybacktweets/config/field_options.py
@@ -17,4 +17,5 @@ FIELD_OPTIONS = [
      "archived_statuscode",
      "archived_digest",
      "archived_length",
+    "resumption_key",
  ]
author	Claromes <clarissamendes@alunos.utfpr.edu.br>
	Tue, 14 Jan 2025 12:20:42 +0000 (09:20 -0300)
committer	Claromes <clarissamendes@alunos.utfpr.edu.br>
	Tue, 14 Jan 2025 12:20:42 +0000 (09:20 -0300)
docs/streamlit.rst		patch \| blob \| history
waybacktweets/_cli.py		patch \| blob \| history
waybacktweets/api/parse.py		patch \| blob \| history
waybacktweets/api/request.py		patch \| blob \| history
waybacktweets/config/field_options.py		patch \| blob \| history