add collapse option
author: Claromes <claromes@hey.com>
Fri, 14 Jun 2024 08:23:33 +0000 (05:23 -0300)
committer: Claromes <claromes@hey.com>
Fri, 14 Jun 2024 08:23:33 +0000 (05:23 -0300)
app/app.py
app/requirements.txt [new file with mode: 0644]
docs/_static/css/custom.css
waybacktweets/api/export_tweets.py
waybacktweets/api/request_tweets.py
waybacktweets/cli/main.py
waybacktweets/utils/utils.py

index f6520f03dd16c0e4dcfd4fcfeee4eba20839e1f9..c4a7ab73c9d7b16a2dc561dfa1057de30d0749f7 100644 (file)
@@ -108,7 +108,7 @@ def next_page():
 
 @st.cache_data(ttl=1800, show_spinner=False)
 def tweets_count(username, archived_timestamp_filter):
-    url = f"https://web.archive.org/cdx/search/cdx?url=https://twitter.com/{username}/status/*&collapse=timestamp:8&output=json&from={archived_timestamp_filter[0]}&to={archived_timestamp_filter[1]}"  # noqa: E501
+    url = f"https://web.archive.org/cdx/search/cdx?url=https://twitter.com/{username}/status/*&output=json&from={archived_timestamp_filter[0]}&to={archived_timestamp_filter[1]}"  # noqa: E501
 
     try:
         response = get_response(url=url)
@@ -189,7 +189,7 @@ if query or st.session_state.count:
     )
 
     st.caption(
-        "The search optimization uses an 8-digit [collapsing strategy](https://github.com/internetarchive/wayback/blob/master/wayback-cdx-server/README.md?ref=hackernoon.com#collapsing), refining the captures to one per day. The number of tweets per page is set to 25, and this is a fixed value due to the API rate limit."  # noqa: E501
+        "The number of tweets per page is set to 25, and this is a fixed value due to the API rate limit."  # noqa: E501
     )
     st.write(f"**{st.session_state.count} URLs have been captured**")
 
@@ -202,9 +202,13 @@ if query or st.session_state.count:
 
         # Tweet Listing Processing
 
+        collapse = None
+        if unique:
+            collapse = "urlkey"
+
         response = WaybackTweets(
             username,
-            unique,
+            collapse,
             st.session_state.archived_timestamp_filter[0],
             st.session_state.archived_timestamp_filter[1],
             tweets_per_page,
diff --git a/app/requirements.txt b/app/requirements.txt
new file mode 100644 (file)
index 0000000..7b81516
--- /dev/null
@@ -0,0 +1,3 @@
+requests>=2.30.0
+streamlit==1.35.0
+waybacktweets>=1.0
index ad27dca2847a62dca5602c4acfcb26e3290c1421..d3c7d77ae594316e8f863244e3849885f6deb2d7 100644 (file)
@@ -1,3 +1,4 @@
-#cli #usage #wbt h3 {
+#cli #usage #wbt h3,
+.sphinxsidebarwrapper li ul li ul:has(a[href="#wbt"]):last-child{
     display: none;
 }
index 802f3ba9248a6cdb04e3ba2457aeef5d1a287670..6f38351dc3d0f574ba0f70294a993e074539773b 100644 (file)
@@ -16,7 +16,7 @@ class TweetsExporter:
         self.field_options = field_options
         self.formatted_datetime = self._datetime_now()
         self.filename = f"{self.username}_tweets_{self.formatted_datetime}"
-        self.dataframe = self._create_dataframe(self)
+        self.dataframe = self._create_dataframe()
 
     @staticmethod
     def _datetime_now():
index 4260c3e803442be4d3af4d1373e3d61053147e4d..72eaf89f952a5cf492c0968500d7ee80f9feb1c2 100644 (file)
@@ -7,9 +7,9 @@ from waybacktweets.utils.utils import get_response
 class WaybackTweets:
     """Requests data from the Wayback CDX Server API and returns it in JSON format."""
 
-    def __init__(self, username, unique, timestamp_from, timestamp_to, limit, offset):
+    def __init__(self, username, collapse, timestamp_from, timestamp_to, limit, offset):
         self.username = username
-        self.unique = unique
+        self.collapse = collapse
         self.timestamp_from = timestamp_from
         self.timestamp_to = timestamp_to
         self.limit = limit
@@ -23,8 +23,8 @@ class WaybackTweets:
             "output": "json",
         }
 
-        if self.unique:
-            params["collapse"] = "urlkey"
+        if self.collapse:
+            params["collapse"] = self.collapse
 
         if self.timestamp_from:
             params["from"] = self.timestamp_from
index 7a606027914be73285e87153dff45eac0ec1f773..7d01d05b4693ac74909726c7025aafa45f128eb2 100644 (file)
@@ -17,26 +17,28 @@ from waybacktweets.utils.utils import parse_date
 @click.command()
 @click.argument("username", type=str)
 @click.option(
-    "--unique",
-    type=bool,
-    default=False,
-    help="Only show unique URLs. Filtering by the collapse option using the urlkey field.",  # noqa: E501
+    "--collapse",
+    type=click.Choice(["urlkey", "digest", "timestamp:XX"], case_sensitive=False),
+    default=None,
+    help="Collapse results based on a field, or a substring of a field. XX in the timestamp value ranges from 1 to 14, comparing the first XX digits of the timestamp field. It is recommended to use from 4 onwards, to compare at least by years.",  # noqa: E501
 )
 @click.option(
     "--from",
     "timestamp_from",
     type=click.UNPROCESSED,
+    metavar="DATE",
     callback=parse_date,
     default=None,
-    help="Filtering by date range from this date.",
+    help="Filtering by date range from this date. Format: YYYYmmdd",
 )
 @click.option(
     "--to",
     "timestamp_to",
     type=click.UNPROCESSED,
+    metavar="DATE",
     callback=parse_date,
     default=None,
-    help="Filtering by date range up to this date.",
+    help="Filtering by date range up to this date. Format: YYYYmmdd",
 )
 @click.option("--limit", type=int, default=None, help="Query result limits.")
 @click.option(
@@ -47,21 +49,21 @@ from waybacktweets.utils.utils import parse_date
 )
 def cli(
     username: str,
-    unique: bool,
+    collapse: Optional[str],
     timestamp_from: Optional[str],
     timestamp_to: Optional[str],
     limit: Optional[int],
     offset: Optional[int],
 ) -> None:
     """
-    Retrieves archived tweets' CDX data from the Wayback Machine,
+    Retrieves archived tweets CDX data from the Wayback Machine,
     performs necessary parsing, and saves the data.
 
     USERNAME: The Twitter username without @.
     """
     try:
         api = WaybackTweets(
-            username, unique, timestamp_from, timestamp_to, limit, offset
+            username, collapse, timestamp_from, timestamp_to, limit, offset
         )
         archived_tweets = api.get()
 
@@ -91,7 +93,6 @@ def cli(
             exporter.save_to_csv()
             exporter.save_to_json()
             exporter.save_to_html()
-
     except exceptions as e:
         rprint(f"[red]{e}")
     finally:
index 7c4d1ca69d0f9e2749683dbf7cada7e5abe00d17..eff374a3c016e861ac5e2cc0557d3ed30b241d40 100644 (file)
@@ -5,6 +5,7 @@ Helper functions.
 import re
 from datetime import datetime
 
+import click
 import requests
 from requests.adapters import HTTPAdapter
 from urllib3.util.retry import Retry
@@ -137,8 +138,12 @@ def parse_date(ctx=None, param=None, value=None):
         str: The input date string formatted in the "YYYYMMDD" format,
         or None if no date string was provided.
     """
-    if value is None:
-        return None
+    try:
+        if value is None:
+            return None
 
-    date = datetime.strptime(value, "%Y%m%d")
-    return date.strftime("%Y%m%d")
+        date = datetime.strptime(value, "%Y%m%d")
+
+        return date.strftime("%Y%m%d")
+    except ValueError:
+        raise click.BadParameter("Date must be in format YYYYmmdd")