add collapse option
author: Claromes <claromes@hey.com>
Fri, 14 Jun 2024 08:23:33 +0000 (05:23 -0300)
committer: Claromes <claromes@hey.com>
Fri, 14 Jun 2024 08:23:33 +0000 (05:23 -0300)
app/app.py
app/requirements.txt [new file with mode: 0644]
docs/_static/css/custom.css
waybacktweets/api/export_tweets.py
waybacktweets/api/request_tweets.py
waybacktweets/cli/main.py
waybacktweets/utils/utils.py

index f6520f03dd16c0e4dcfd4fcfeee4eba20839e1f9..c4a7ab73c9d7b16a2dc561dfa1057de30d0749f7 100644 (file)
@@ -108,7 +108,7 @@ def next_page():
 
 @st.cache_data(ttl=1800, show_spinner=False)
 def tweets_count(username, archived_timestamp_filter):
-    url = f"https://web.archive.org/cdx/search/cdx?url=https://twitter.com/{username}/status/*&collapse=timestamp:8&output=json&from={archived_timestamp_filter[0]}&to={archived_timestamp_filter[1]}"  # noqa: E501
+    url = f"https://web.archive.org/cdx/search/cdx?url=https://twitter.com/{username}/status/*&output=json&from={archived_timestamp_filter[0]}&to={archived_timestamp_filter[1]}"  # noqa: E501
 
     try:
         response = get_response(url=url)
@@ -189,7 +189,7 @@ if query or st.session_state.count:
     )
 
     st.caption(
-        "The search optimization uses an 8-digit [collapsing strategy](https://github.com/internetarchive/wayback/blob/master/wayback-cdx-server/README.md?ref=hackernoon.com#collapsing), refining the captures to one per day. The number of tweets per page is set to 25, and this is a fixed value due to the API rate limit."  # noqa: E501
+        "The number of tweets per page is set to 25, and this is a fixed value due to the API rate limit."  # noqa: E501
     )
     st.write(f"**{st.session_state.count} URLs have been captured**")
 
@@ -202,9 +202,13 @@ if query or st.session_state.count:
 
         # Tweet Listing Processing
 
+        collapse = None
+        if unique:
+            collapse = "urlkey"
+
         response = WaybackTweets(
             username,
-            unique,
+            collapse,
             st.session_state.archived_timestamp_filter[0],
             st.session_state.archived_timestamp_filter[1],
             tweets_per_page,
diff --git a/app/requirements.txt b/app/requirements.txt
new file mode 100644 (file)
index 0000000..7b81516
--- /dev/null
@@ -0,0 +1,3 @@
+requests>=2.30.0
+streamlit==1.35.0
+waybacktweets>=1.0
index ad27dca2847a62dca5602c4acfcb26e3290c1421..d3c7d77ae594316e8f863244e3849885f6deb2d7 100644 (file)
@@ -1,3 +1,4 @@
-#cli #usage #wbt h3 {
+#cli #usage #wbt h3,
+.sphinxsidebarwrapper li ul li ul:has(a[href="#wbt"]):last-child{
     display: none;
 }
index 802f3ba9248a6cdb04e3ba2457aeef5d1a287670..6f38351dc3d0f574ba0f70294a993e074539773b 100644 (file)
@@ -16,7 +16,7 @@ class TweetsExporter:
         self.field_options = field_options
         self.formatted_datetime = self._datetime_now()
         self.filename = f"{self.username}_tweets_{self.formatted_datetime}"
-        self.dataframe = self._create_dataframe(self)
+        self.dataframe = self._create_dataframe()
 
     @staticmethod
     def _datetime_now():
index 4260c3e803442be4d3af4d1373e3d61053147e4d..72eaf89f952a5cf492c0968500d7ee80f9feb1c2 100644 (file)
@@ -7,9 +7,9 @@ from waybacktweets.utils.utils import get_response
 class WaybackTweets:
     """Requests data from the Wayback CDX Server API and returns it in JSON format."""
 
-    def __init__(self, username, unique, timestamp_from, timestamp_to, limit, offset):
+    def __init__(self, username, collapse, timestamp_from, timestamp_to, limit, offset):
         self.username = username
-        self.unique = unique
+        self.collapse = collapse
         self.timestamp_from = timestamp_from
         self.timestamp_to = timestamp_to
         self.limit = limit
@@ -23,8 +23,8 @@ class WaybackTweets:
             "output": "json",
         }
 
-        if self.unique:
-            params["collapse"] = "urlkey"
+        if self.collapse:
+            params["collapse"] = self.collapse
 
         if self.timestamp_from:
             params["from"] = self.timestamp_from
index 7a606027914be73285e87153dff45eac0ec1f773..7d01d05b4693ac74909726c7025aafa45f128eb2 100644 (file)
@@ -17,26 +17,28 @@ from waybacktweets.utils.utils import parse_date
 @click.command()
 @click.argument("username", type=str)
 @click.option(
-    "--unique",
-    type=bool,
-    default=False,
-    help="Only show unique URLs. Filtering by the collapse option using the urlkey field.",  # noqa: E501
+    "--collapse",
+    type=click.Choice(["urlkey", "digest", "timestamp:XX"], case_sensitive=False),
+    default=None,
+    help="Collapse results based on a field, or a substring of a field. XX in the timestamp value ranges from 1 to 14, comparing the first XX digits of the timestamp field. It is recommended to use from 4 onwards, to compare at least by years.",  # noqa: E501
 )
 @click.option(
     "--from",
     "timestamp_from",
     type=click.UNPROCESSED,
+    metavar="DATE",
     callback=parse_date,
     default=None,
-    help="Filtering by date range from this date.",
+    help="Filtering by date range from this date. Format: YYYYmmdd",
 )
 @click.option(
     "--to",
     "timestamp_to",
     type=click.UNPROCESSED,
+    metavar="DATE",
     callback=parse_date,
     default=None,
-    help="Filtering by date range up to this date.",
+    help="Filtering by date range up to this date. Format: YYYYmmdd",
 )
 @click.option("--limit", type=int, default=None, help="Query result limits.")
 @click.option(
@@ -47,21 +49,21 @@ from waybacktweets.utils.utils import parse_date
 )
 def cli(
     username: str,
-    unique: bool,
+    collapse: Optional[str],
     timestamp_from: Optional[str],
     timestamp_to: Optional[str],
     limit: Optional[int],
     offset: Optional[int],
 ) -> None:
     """
-    Retrieves archived tweets' CDX data from the Wayback Machine,
+    Retrieves archived tweets CDX data from the Wayback Machine,
     performs necessary parsing, and saves the data.
 
     USERNAME: The Twitter username without @.
     """
     try:
         api = WaybackTweets(
-            username, unique, timestamp_from, timestamp_to, limit, offset
+            username, collapse, timestamp_from, timestamp_to, limit, offset
         )
         archived_tweets = api.get()
 
@@ -91,7 +93,6 @@ def cli(
             exporter.save_to_csv()
             exporter.save_to_json()
             exporter.save_to_html()
-
     except exceptions as e:
         rprint(f"[red]{e}")
     finally:
index 7c4d1ca69d0f9e2749683dbf7cada7e5abe00d17..eff374a3c016e861ac5e2cc0557d3ed30b241d40 100644 (file)
@@ -5,6 +5,7 @@ Helper functions.
 import re
 from datetime import datetime
 
+import click
 import requests
 from requests.adapters import HTTPAdapter
 from urllib3.util.retry import Retry
@@ -137,8 +138,12 @@ def parse_date(ctx=None, param=None, value=None):
         str: The input date string formatted in the "YYYYMMDD" format,
         or None if no date string was provided.
     """
-    if value is None:
-        return None
+    try:
+        if value is None:
+            return None
 
-    date = datetime.strptime(value, "%Y%m%d")
-    return date.strftime("%Y%m%d")
+        date = datetime.strptime(value, "%Y%m%d")
+
+        return date.strftime("%Y%m%d")
+    except ValueError:
+        raise click.BadParameter("Date must be in format YYYYmmdd")