from waybacktweets.api.export_tweets import TweetsExporter
from waybacktweets.api.parse_tweets import JsonParser, TweetsParser
from waybacktweets.api.request_tweets import WaybackTweets
+from waybacktweets.config.config import config
+from waybacktweets.exceptions.exceptions import (
+ ConnectionError,
+ EmptyResponseError,
+ ReadTimeoutError,
+)
from waybacktweets.utils.utils import (
check_double_status,
get_response,
st.session_state.archived_timestamp_filter = (start_date, end_date)
+# Verbose mode configuration
+
+config.verbose = False
+
+
# Pagination Settings
def tweets_count(username, archived_timestamp_filter):
url = f"https://web.archive.org/cdx/search/cdx?url=https://twitter.com/{username}/status/*&output=json&from={archived_timestamp_filter[0]}&to={archived_timestamp_filter[1]}" # noqa: E501
- response, error, error_type = get_response(url=url)
-
- if response.status_code == 200:
- data = response.json()
- if data and len(data) > 1:
- total_tweets = len(data) - 1
- return total_tweets
- else:
- return 0
- elif error and error_type == "ReadTimeout":
- st.error("Failed to establish a new connection with web.archive.org.")
+ try:
+ response = get_response(url=url)
+
+ if response.status_code == 200:
+ data = response.json()
+ if data and len(data) > 1:
+ total_tweets = len(data) - 1
+ return total_tweets
+ else:
+ return 0
+ except ReadTimeoutError:
+ st.error("Connection to web.archive.org timed out.")
st.stop()
- elif error and error_type == "ConnectionError":
+ except ConnectionError:
st.error("Failed to establish a new connection with web.archive.org.")
st.stop()
- elif error and error_type:
- st.error(f"{error}")
+ except EmptyResponseError:
+ st.error("No data was saved due to an empty response.")
st.stop()
API
====
-Request
----------
-
-.. module:: waybacktweets.api.request_tweets
+Config
+------------
-.. autoclass:: WaybackTweets
+.. automodule:: waybacktweets.config.config
:members:
+Exceptions
+------------
-Parse
----------
+.. automodule:: waybacktweets.exceptions.exceptions
-.. module:: waybacktweets.api.parse_tweets
+.. autoclass:: ReadTimeoutError
+ :members:
-.. autoclass:: TweetsParser
+.. autoclass:: ConnectionError
:members:
-.. autoclass:: TwitterEmbed
+.. autoclass:: HTTPError
:members:
-.. autoclass:: JsonParser
+.. autoclass:: EmptyResponseError
+ :members:
+
+.. autoclass:: GetResponseError
:members:
Export
---------
-.. module:: waybacktweets.api.export_tweets
+.. automodule:: waybacktweets.api.export_tweets
.. autoclass:: TweetsExporter
:members:
-Visualizer
------------
+Parse
+---------
-.. module:: waybacktweets.api.viz_tweets
+.. automodule:: waybacktweets.api.parse_tweets
-.. autoclass:: HTMLTweetsVisualizer
+.. autoclass:: TweetsParser
+ :members:
+
+.. autoclass:: TwitterEmbed
+ :members:
+
+.. autoclass:: JsonParser
+ :members:
+
+
+Request
+---------
+
+.. automodule:: waybacktweets.api.request_tweets
+
+.. autoclass:: WaybackTweets
:members:
Utils
-------
-.. module:: waybacktweets.utils.utils
+.. automodule:: waybacktweets.utils.utils
.. autofunction:: check_double_status
.. autofunction:: check_pattern_tweet
.. autofunction:: get_response
.. autofunction:: is_tweet_url
.. autofunction:: semicolon_parser
+
+
+Visualizer
+-----------
+
+.. automodule:: waybacktweets.api.viz_tweets
+
+.. autoclass:: HTMLTweetsVisualizer
+ :members:
These are the most common errors and are handled by the ``waybacktweets`` package.
-ReadTimeout
+ReadTimeoutError
----------------
This error occurs when a request to the web.archive.org server takes too long to respond. The server could be overloaded or there could be network issues.
The output message from the package would be: ``Temporarily Offline: Internet Archive services are temporarily offline. Please check Internet Archive Twitter feed (https://twitter.com/internetarchive) for the latest information.``
+EmptyResponseError
+----------------------
+This exception is raised for empty responses.
+
+The output message from the package would be: ``No data was saved due to an empty response.``
from waybacktweets.api.export_tweets import TweetsExporter
from waybacktweets.api.parse_tweets import TweetsParser
from waybacktweets.api.request_tweets import WaybackTweets
+from waybacktweets.config.config import config
def parse_date(
@click.command()
@click.argument("username", type=str)
@click.option(
+ "-c",
"--collapse",
type=click.Choice(["urlkey", "digest", "timestamp:XX"], case_sensitive=False),
default=None,
help="Collapse results based on a field, or a substring of a field. XX in the timestamp value ranges from 1 to 14, comparing the first XX digits of the timestamp field. It is recommended to use from 4 onwards, to compare at least by years.", # noqa: E501
)
@click.option(
+ "-f",
"--from",
"timestamp_from",
type=click.UNPROCESSED,
help="Filtering by date range from this date. Format: YYYYmmdd",
)
@click.option(
+ "-t",
"--to",
"timestamp_to",
type=click.UNPROCESSED,
help="Filtering by date range up to this date. Format: YYYYmmdd",
)
@click.option(
- "--limit", type=int, metavar="INTEGER", default=None, help="Query result limits."
+ "-l",
+ "--limit",
+ type=int,
+ metavar="INTEGER",
+ default=None,
+ help="Query result limits.",
)
@click.option(
+ "-o",
"--offset",
type=int,
metavar="INTEGER",
help="Allows for a simple way to scroll through the results.",
)
@click.option(
+ "-mt",
"--matchtype",
type=click.Choice(["exact", "prefix", "host", "domain"], case_sensitive=False),
default=None,
help="Results matching a certain prefix, a certain host or all subdomains.", # noqa: E501
)
+@click.option(
+ "-v",
+ "--verbose",
+ "verbose",
+ is_flag=True,
+ default=False,
+ help="Shows the error log.",
+)
def main(
username: str,
collapse: Optional[str],
limit: Optional[int],
offset: Optional[int],
matchtype: Optional[str],
+ verbose: Optional[bool],
) -> None:
"""
Retrieves archived tweets CDX data from the Wayback Machine,
USERNAME: The Twitter username without @.
"""
try:
+ config.verbose = verbose
+
api = WaybackTweets(
username, collapse, timestamp_from, timestamp_to, limit, offset, matchtype
)
from rich import print as rprint
from rich.progress import Progress
+from waybacktweets.config.config import config
+from waybacktweets.exceptions.exceptions import (
+ ConnectionError,
+ GetResponseError,
+ HTTPError,
+)
from waybacktweets.utils.utils import (
check_double_status,
check_pattern_tweet,
availability statuses, and URLs, respectively. If no tweets are available,
returns None.
"""
- url = f"https://publish.twitter.com/oembed?url={self.tweet_url}"
- response, error, error_type = get_response(url=url)
-
- if response:
- json_response = response.json()
- html = json_response["html"]
- author_name = json_response["author_name"]
+ try:
+ url = f"https://publish.twitter.com/oembed?url={self.tweet_url}"
+ response = get_response(url=url)
+ if response:
+ json_response = response.json()
+ html = json_response["html"]
+ author_name = json_response["author_name"]
+
+ regex = re.compile(
+ r'<blockquote class="twitter-tweet"(?: [^>]+)?><p[^>]*>(.*?)<\/p>.*?— (.*?)<\/a>', # noqa
+ re.DOTALL,
+ )
+ regex_author = re.compile(r"^(.*?)\s*\(")
+
+ matches_html = regex.findall(html)
+
+ tweet_content = []
+ user_info = []
+ is_RT = []
+
+ for match in matches_html:
+ tweet_content_match = re.sub(
+ r"<a[^>]*>|<\/a>", "", match[0].strip()
+ ).replace("<br>", "\n")
+ user_info_match = re.sub(
+ r"<a[^>]*>|<\/a>", "", match[1].strip()
+ ).replace(")", "), ")
+ match_author = regex_author.search(user_info_match)
+ author_tweet = match_author.group(1) if match_author else ""
+
+ if tweet_content_match:
+ tweet_content.append(tweet_content_match)
+ if user_info_match:
+ user_info.append(user_info_match)
+ is_RT.append(author_name != author_tweet)
+
+ return tweet_content, is_RT, user_info
+ except ConnectionError:
+ if config.verbose:
+ rprint("[yellow]Error parsing the tweet, but the CDX data was saved.")
+ except HTTPError:
+ if config.verbose:
+ rprint(
+ f"[yellow]{self.tweet_url} not available on the user's account, but the CDX data was saved." # noqa: E501
+ )
+ except GetResponseError as e:
+ if config.verbose:
+ rprint(f"[red]An error occurred: {str(e)}")
- regex = re.compile(
- r'<blockquote class="twitter-tweet"(?: [^>]+)?><p[^>]*>(.*?)<\/p>.*?— (.*?)<\/a>', # noqa
- re.DOTALL,
- )
- regex_author = re.compile(r"^(.*?)\s*\(")
-
- matches_html = regex.findall(html)
-
- tweet_content = []
- user_info = []
- is_RT = []
-
- for match in matches_html:
- tweet_content_match = re.sub(
- r"<a[^>]*>|<\/a>", "", match[0].strip()
- ).replace("<br>", "\n")
- user_info_match = re.sub(
- r"<a[^>]*>|<\/a>", "", match[1].strip()
- ).replace(")", "), ")
- match_author = regex_author.search(user_info_match)
- author_tweet = match_author.group(1) if match_author else ""
-
- if tweet_content_match:
- tweet_content.append(tweet_content_match)
- if user_info_match:
- user_info.append(user_info_match)
- is_RT.append(author_name != author_tweet)
-
- return tweet_content, is_RT, user_info
- elif error and error_type == "ConnectionError":
- rprint("[yellow]Error parsing the tweet, but the CDX data was saved.")
- elif error and error_type == "HTTPError":
- rprint(
- f"[yellow]{self.tweet_url} not available on the user's account, but the CDX data was saved." # noqa: E501
- )
- return None
- elif error and error_type:
- rprint(f"[red]{error}")
- return None
+ return None
# TODO: JSON Issue - Create separate function to handle JSON return without hitting rate limiting # noqa: E501
:returns: The parsed tweet text.
"""
- response, error, error_type = get_response(url=self.archived_tweet_url)
+ try:
+ response = get_response(url=self.archived_tweet_url)
+
+ if response:
+ json_data = response.json()
- if response:
- json_data = response.json()
+ if "data" in json_data:
+ return json_data["data"].get("text", json_data["data"])
- if "data" in json_data:
- return json_data["data"].get("text", json_data["data"])
+ if "retweeted_status" in json_data:
+ return json_data["retweeted_status"].get(
+ "text", json_data["retweeted_status"]
+ )
- if "retweeted_status" in json_data:
- return json_data["retweeted_status"].get(
- "text", json_data["retweeted_status"]
+ return json_data.get("text", json_data)
+ except ConnectionError:
+ if config.verbose:
+ rprint(
+ f"[yellow]Connection error with {self.archived_tweet_url}. Max retries exceeded. Error parsing the JSON, but the CDX data was saved." # noqa: E501
)
+ except GetResponseError as e:
+ if config.verbose:
+ rprint(f"[red]An error occurred: {str(e)}")
- return json_data.get("text", json_data)
- elif error and error_type == "ConnectionError":
- rprint(
- f"[yellow]Connection error with {self.archived_tweet_url}. Max retries exceeded. Error parsing the JSON, but the CDX data was saved." # noqa: E501
- )
- return None
- elif error and error_type:
- rprint(f"[red]{error}")
- return None
+ return None
class TweetsParser:
Parses the archived tweets CDX data and structures it.
:param print_progress: A boolean indicating whether to print progress or not.
+
:returns: The parsed tweets data.
"""
with ThreadPoolExecutor(max_workers=10) as executor:
from rich import print as rprint
+from waybacktweets.config.config import config
+from waybacktweets.exceptions.exceptions import (
+ ConnectionError,
+ EmptyResponseError,
+ GetResponseError,
+ HTTPError,
+ ReadTimeoutError,
+)
from waybacktweets.utils.utils import get_response
if self.matchtype:
params["matchType"] = self.matchtype
- response, error, error_type = get_response(url=url, params=params)
-
- if response:
+ try:
+ response = get_response(url=url, params=params)
return response.json()
- elif error and error_type == "ReadTimeout":
- rprint("[red]Connection to web.archive.org timed out.")
- elif error and error_type == "ConnectionError":
- rprint(
- "[red]Failed to establish a new connection with web.archive.org. Max retries exceeded. Please wait a few minutes and try again." # noqa: E501
- )
- elif error and error_type == "HTTPError":
- rprint("[red]Connection to web.archive.org timed out.")
- elif error and error_type:
- rprint(
- "[red]Temporarily Offline: Internet Archive services are temporarily offline. Please check Internet Archive Twitter feed (https://twitter.com/internetarchive) for the latest information." # noqa: E501
- )
+ except ReadTimeoutError:
+ if config.verbose:
+ rprint("[red]Connection to web.archive.org timed out.")
+ except ConnectionError:
+ if config.verbose:
+ rprint(
+ "[red]Failed to establish a new connection with web.archive.org. Max retries exceeded. Please wait a few minutes and try again." # noqa: E501
+ )
+ except HTTPError as e:
+ if config.verbose:
+ rprint(f"[red]HTTP error occurred: {str(e)}")
+ except EmptyResponseError:
+ if config.verbose:
+ rprint("[red]No data was saved due to an empty response.")
+ except GetResponseError as e:
+ if config.verbose:
+ rprint(f"[red]An error occurred: {str(e)}")
+
+ return None
--- /dev/null
+# flake8: noqa: F401
+
+from waybacktweets.config.config import config
--- /dev/null
+"""
+Manages global configuration settings throughout the application.
+"""
+
+
+class _Config:
+ def __init__(self, verbose: bool = True):
+ self.verbose = verbose
+
+
+config = _Config()
+"""
+Configuration settings.
+
+.. attribute:: verbose
+
+ Determines if verbose logging should be enabled.
+"""
--- /dev/null
+# flake8: noqa: F401
+
+from waybacktweets.exceptions.exceptions import (
+ ConnectionError,
+ EmptyResponseError,
+ GetResponseError,
+ HTTPError,
+ ReadTimeoutError,
+)
--- /dev/null
+"""
+Wayback Tweets Exceptions
+"""
+
+
+class GetResponseError(Exception):
+ """Base class for exceptions in get_response."""
+
+
+class ReadTimeoutError(GetResponseError):
+ """Exception raised for read timeout errors."""
+
+
+class ConnectionError(GetResponseError):
+ """Exception raised for connection errors."""
+
+
+class HTTPError(GetResponseError):
+ """Exception raised for HTTP errors."""
+
+
+class EmptyResponseError(GetResponseError):
+ """Exception raised for empty responses."""
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
+from waybacktweets.exceptions.exceptions import (
+ ConnectionError,
+ EmptyResponseError,
+ GetResponseError,
+ HTTPError,
+ ReadTimeoutError,
+)
+
def get_response(
url: str, params: Optional[dict] = None
) -> Tuple[Optional[requests.Response], Optional[str], Optional[str]]:
"""
- Sends a GET request to the specified URL and returns the response,
- an error message if any, and the type of exception if any.
+ Sends a GET request to the specified URL and returns the response.
:param url: The URL to send the GET request to.
:param params: The parameters to include in the GET request.
- :returns: A tuple containing the response from the server or None,
- an error message or None, and the type of exception or None.
+ :returns: The response from the server.
+
+ :raises ReadTimeoutError: If a read timeout occurs.
+ :raises ConnectionError: If a connection error occurs.
+ :raises HTTPError: If an HTTP error occurs.
+ :raises EmptyResponseError: If the response is empty.
"""
session = requests.Session()
retry = Retry(connect=3, backoff_factor=0.3)
response.raise_for_status()
if not response or response.json() == []:
- return None, "No data was saved due to an empty response.", None
- return response, None, None
- except requests.exceptions.RequestException as e:
- return None, str(e), type(e).__name__
- except Exception as e:
- return None, str(e), type(e).__name__
+ raise EmptyResponseError("No data was saved due to an empty response.")
+ return response
+ except requests.exceptions.ReadTimeout:
+ raise ReadTimeoutError
+ except requests.exceptions.ConnectionError:
+ raise ConnectionError
+ except requests.exceptions.HTTPError:
+ raise HTTPError
+ except requests.exceptions.RequestException:
+ raise GetResponseError
def clean_tweet_url(tweet_url: str, username: str) -> str: