From: Claromes Date: Fri, 14 Jun 2024 08:23:33 +0000 (-0300) Subject: add collapse option X-Git-Url: https://git.claromes.com/?a=commitdiff_plain;h=e7d8d052fe92f722aa675cb77fc282740a566391;p=waybacktweets.git add collapse option --- diff --git a/app/app.py b/app/app.py index f6520f0..c4a7ab7 100644 --- a/app/app.py +++ b/app/app.py @@ -108,7 +108,7 @@ def next_page(): @st.cache_data(ttl=1800, show_spinner=False) def tweets_count(username, archived_timestamp_filter): - url = f"https://web.archive.org/cdx/search/cdx?url=https://twitter.com/{username}/status/*&collapse=timestamp:8&output=json&from={archived_timestamp_filter[0]}&to={archived_timestamp_filter[1]}" # noqa: E501 + url = f"https://web.archive.org/cdx/search/cdx?url=https://twitter.com/{username}/status/*&output=json&from={archived_timestamp_filter[0]}&to={archived_timestamp_filter[1]}" # noqa: E501 try: response = get_response(url=url) @@ -189,7 +189,7 @@ if query or st.session_state.count: ) st.caption( - "The search optimization uses an 8-digit [collapsing strategy](https://github.com/internetarchive/wayback/blob/master/wayback-cdx-server/README.md?ref=hackernoon.com#collapsing), refining the captures to one per day. The number of tweets per page is set to 25, and this is a fixed value due to the API rate limit." # noqa: E501 + "The number of tweets per page is set to 25, and this is a fixed value due to the API rate limit." # noqa: E501 ) st.write(f"**{st.session_state.count} URLs have been captured**") @@ -202,9 +202,13 @@ if query or st.session_state.count: # Tweet Listing Processing + collapse = None + if unique: + collapse = "urlkey" + response = WaybackTweets( username, - unique, + collapse, st.session_state.archived_timestamp_filter[0], st.session_state.archived_timestamp_filter[1], tweets_per_page, diff --git a/app/requirements.txt b/app/requirements.txt new file mode 100644 index 0000000..7b81516 --- /dev/null +++ b/app/requirements.txt @@ -0,0 +1,3 @@ +requests>=2.30.0 +streamlit==1.35.0 +waybacktweets>=1.0 diff --git a/docs/_static/css/custom.css b/docs/_static/css/custom.css index ad27dca..d3c7d77 100644 --- a/docs/_static/css/custom.css +++ b/docs/_static/css/custom.css @@ -1,3 +1,4 @@ -#cli #usage #wbt h3 { +#cli #usage #wbt h3, +.sphinxsidebarwrapper li ul li ul:has(a[href="#wbt"]):last-child{ display: none; } diff --git a/waybacktweets/api/export_tweets.py b/waybacktweets/api/export_tweets.py index 802f3ba..6f38351 100644 --- a/waybacktweets/api/export_tweets.py +++ b/waybacktweets/api/export_tweets.py @@ -16,7 +16,7 @@ class TweetsExporter: self.field_options = field_options self.formatted_datetime = self._datetime_now() self.filename = f"{self.username}_tweets_{self.formatted_datetime}" - self.dataframe = self._create_dataframe(self) + self.dataframe = self._create_dataframe() @staticmethod def _datetime_now(): diff --git a/waybacktweets/api/request_tweets.py b/waybacktweets/api/request_tweets.py index 4260c3e..72eaf89 100644 --- a/waybacktweets/api/request_tweets.py +++ b/waybacktweets/api/request_tweets.py @@ -7,9 +7,9 @@ from waybacktweets.utils.utils import get_response class WaybackTweets: """Requests data from the Wayback CDX Server API and returns it in JSON format.""" - def __init__(self, username, unique, timestamp_from, timestamp_to, limit, offset): + def __init__(self, username, collapse, timestamp_from, timestamp_to, limit, offset): self.username = username - self.unique = unique + self.collapse = collapse self.timestamp_from = timestamp_from self.timestamp_to = timestamp_to self.limit = limit @@ -23,8 +23,8 @@ class WaybackTweets: "output": "json", } - if self.unique: - params["collapse"] = "urlkey" + if self.collapse: + params["collapse"] = self.collapse if self.timestamp_from: params["from"] = self.timestamp_from diff --git a/waybacktweets/cli/main.py b/waybacktweets/cli/main.py index 7a60602..7d01d05 100644 --- a/waybacktweets/cli/main.py +++ b/waybacktweets/cli/main.py @@ -17,26 +17,28 @@ from waybacktweets.utils.utils import parse_date @click.command() @click.argument("username", type=str) @click.option( - "--unique", - type=bool, - default=False, - help="Only show unique URLs. Filtering by the collapse option using the urlkey field.", # noqa: E501 + "--collapse", + type=click.Choice(["urlkey", "digest", "timestamp:XX"], case_sensitive=False), + default=None, + help="Collapse results based on a field, or a substring of a field. XX in the timestamp value ranges from 1 to 14, comparing the first XX digits of the timestamp field. It is recommended to use from 4 onwards, to compare at least by years.", # noqa: E501 ) @click.option( "--from", "timestamp_from", type=click.UNPROCESSED, + metavar="DATE", callback=parse_date, default=None, - help="Filtering by date range from this date.", + help="Filtering by date range from this date. Format: YYYYmmdd", ) @click.option( "--to", "timestamp_to", type=click.UNPROCESSED, + metavar="DATE", callback=parse_date, default=None, - help="Filtering by date range up to this date.", + help="Filtering by date range up to this date. Format: YYYYmmdd", ) @click.option("--limit", type=int, default=None, help="Query result limits.") @click.option( @@ -47,21 +49,21 @@ from waybacktweets.utils.utils import parse_date ) def cli( username: str, - unique: bool, + collapse: Optional[str], timestamp_from: Optional[str], timestamp_to: Optional[str], limit: Optional[int], offset: Optional[int], ) -> None: """ - Retrieves archived tweets' CDX data from the Wayback Machine, + Retrieves archived tweets CDX data from the Wayback Machine, performs necessary parsing, and saves the data. USERNAME: The Twitter username without @. """ try: api = WaybackTweets( - username, unique, timestamp_from, timestamp_to, limit, offset + username, collapse, timestamp_from, timestamp_to, limit, offset ) archived_tweets = api.get() @@ -91,7 +93,6 @@ def cli( exporter.save_to_csv() exporter.save_to_json() exporter.save_to_html() - except exceptions as e: rprint(f"[red]{e}") finally: diff --git a/waybacktweets/utils/utils.py b/waybacktweets/utils/utils.py index 7c4d1ca..eff374a 100644 --- a/waybacktweets/utils/utils.py +++ b/waybacktweets/utils/utils.py @@ -5,6 +5,7 @@ Helper functions. import re from datetime import datetime +import click import requests from requests.adapters import HTTPAdapter from urllib3.util.retry import Retry @@ -137,8 +138,12 @@ def parse_date(ctx=None, param=None, value=None): str: The input date string formatted in the "YYYYMMDD" format, or None if no date string was provided. """ - if value is None: - return None + try: + if value is None: + return None - date = datetime.strptime(value, "%Y%m%d") - return date.strftime("%Y%m%d") + date = datetime.strptime(value, "%Y%m%d") + + return date.strftime("%Y%m%d") + except ValueError: + raise click.BadParameter("Date must be in format YYYYmmdd")