@st.cache_data(ttl=1800, show_spinner=False)
def tweets_count(username, archived_timestamp_filter):
- url = f"https://web.archive.org/cdx/search/cdx?url=https://twitter.com/{username}/status/*&collapse=timestamp:8&output=json&from={archived_timestamp_filter[0]}&to={archived_timestamp_filter[1]}" # noqa: E501
+ url = f"https://web.archive.org/cdx/search/cdx?url=https://twitter.com/{username}/status/*&output=json&from={archived_timestamp_filter[0]}&to={archived_timestamp_filter[1]}" # noqa: E501
try:
response = get_response(url=url)
)
st.caption(
- "The search optimization uses an 8-digit [collapsing strategy](https://github.com/internetarchive/wayback/blob/master/wayback-cdx-server/README.md?ref=hackernoon.com#collapsing), refining the captures to one per day. The number of tweets per page is set to 25, and this is a fixed value due to the API rate limit." # noqa: E501
+ "The number of tweets per page is set to 25, and this is a fixed value due to the API rate limit." # noqa: E501
)
st.write(f"**{st.session_state.count} URLs have been captured**")
# Tweet Listing Processing
+ collapse = None
+ if unique:
+ collapse = "urlkey"
+
response = WaybackTweets(
username,
- unique,
+ collapse,
st.session_state.archived_timestamp_filter[0],
st.session_state.archived_timestamp_filter[1],
tweets_per_page,
--- /dev/null
+requests>=2.30.0
+streamlit==1.35.0
+waybacktweets>=1.0
-#cli #usage #wbt h3 {
+#cli #usage #wbt h3,
+.sphinxsidebarwrapper li ul li ul:has(a[href="#wbt"]):last-child{
display: none;
}
self.field_options = field_options
self.formatted_datetime = self._datetime_now()
self.filename = f"{self.username}_tweets_{self.formatted_datetime}"
- self.dataframe = self._create_dataframe(self)
+ self.dataframe = self._create_dataframe()
@staticmethod
def _datetime_now():
class WaybackTweets:
"""Requests data from the Wayback CDX Server API and returns it in JSON format."""
- def __init__(self, username, unique, timestamp_from, timestamp_to, limit, offset):
+ def __init__(self, username, collapse, timestamp_from, timestamp_to, limit, offset):
self.username = username
- self.unique = unique
+ self.collapse = collapse
self.timestamp_from = timestamp_from
self.timestamp_to = timestamp_to
self.limit = limit
"output": "json",
}
- if self.unique:
- params["collapse"] = "urlkey"
+ if self.collapse:
+ params["collapse"] = self.collapse
if self.timestamp_from:
params["from"] = self.timestamp_from
@click.command()
@click.argument("username", type=str)
@click.option(
- "--unique",
- type=bool,
- default=False,
- help="Only show unique URLs. Filtering by the collapse option using the urlkey field.", # noqa: E501
+ "--collapse",
+ type=click.Choice(["urlkey", "digest", "timestamp:XX"], case_sensitive=False),
+ default=None,
+ help="Collapse results based on a field, or a substring of a field. XX in the timestamp value ranges from 1 to 14, comparing the first XX digits of the timestamp field. It is recommended to use from 4 onwards, to compare at least by years.", # noqa: E501
)
@click.option(
"--from",
"timestamp_from",
type=click.UNPROCESSED,
+ metavar="DATE",
callback=parse_date,
default=None,
- help="Filtering by date range from this date.",
+ help="Filtering by date range from this date. Format: YYYYmmdd",
)
@click.option(
"--to",
"timestamp_to",
type=click.UNPROCESSED,
+ metavar="DATE",
callback=parse_date,
default=None,
- help="Filtering by date range up to this date.",
+ help="Filtering by date range up to this date. Format: YYYYmmdd",
)
@click.option("--limit", type=int, default=None, help="Query result limits.")
@click.option(
)
def cli(
username: str,
- unique: bool,
+ collapse: Optional[str],
timestamp_from: Optional[str],
timestamp_to: Optional[str],
limit: Optional[int],
offset: Optional[int],
) -> None:
"""
- Retrieves archived tweets' CDX data from the Wayback Machine,
+ Retrieves archived tweets CDX data from the Wayback Machine,
performs necessary parsing, and saves the data.
USERNAME: The Twitter username without @.
"""
try:
api = WaybackTweets(
- username, unique, timestamp_from, timestamp_to, limit, offset
+ username, collapse, timestamp_from, timestamp_to, limit, offset
)
archived_tweets = api.get()
exporter.save_to_csv()
exporter.save_to_json()
exporter.save_to_html()
-
except exceptions as e:
rprint(f"[red]{e}")
finally:
import re
from datetime import datetime
+import click
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
str: The input date string formatted in the "YYYYMMDD" format,
or None if no date string was provided.
"""
- if value is None:
- return None
+ try:
+ if value is None:
+ return None
- date = datetime.strptime(value, "%Y%m%d")
- return date.strftime("%Y%m%d")
+ date = datetime.strptime(value, "%Y%m%d")
+
+ return date.strftime("%Y%m%d")
+ except ValueError:
+ raise click.BadParameter("Date must be in format YYYYmmdd")