From: Claromes Date: Mon, 17 Jun 2024 01:27:51 +0000 (-0300) Subject: add verbose option, delete log option, review exceptions, update docs, add global... X-Git-Url: https://git.claromes.com/?a=commitdiff_plain;h=445cdad916f1a92e402ed82b02cc804ccaf99c5c;p=waybacktweets.git add verbose option, delete log option, review exceptions, update docs, add global config module --- diff --git a/app/app.py b/app/app.py index 87d630f..b1db36b 100644 --- a/app/app.py +++ b/app/app.py @@ -6,6 +6,12 @@ import streamlit.components.v1 as components from waybacktweets.api.export_tweets import TweetsExporter from waybacktweets.api.parse_tweets import JsonParser, TweetsParser from waybacktweets.api.request_tweets import WaybackTweets +from waybacktweets.config.config import config +from waybacktweets.exceptions.exceptions import ( + ConnectionError, + EmptyResponseError, + ReadTimeoutError, +) from waybacktweets.utils.utils import ( check_double_status, get_response, @@ -93,6 +99,11 @@ if "archived_timestamp_filter" not in st.session_state: st.session_state.archived_timestamp_filter = (start_date, end_date) +# Verbose mode configuration + +config.verbose = False + + # Pagination Settings @@ -128,23 +139,24 @@ def next_page(): def tweets_count(username, archived_timestamp_filter): url = f"https://web.archive.org/cdx/search/cdx?url=https://twitter.com/{username}/status/*&output=json&from={archived_timestamp_filter[0]}&to={archived_timestamp_filter[1]}" # noqa: E501 - response, error, error_type = get_response(url=url) - - if response.status_code == 200: - data = response.json() - if data and len(data) > 1: - total_tweets = len(data) - 1 - return total_tweets - else: - return 0 - elif error and error_type == "ReadTimeout": - st.error("Failed to establish a new connection with web.archive.org.") + try: + response = get_response(url=url) + + if response.status_code == 200: + data = response.json() + if data and len(data) > 1: + total_tweets = len(data) - 1 + return total_tweets + else: + return 0 + except ReadTimeoutError: + st.error("Connection to web.archive.org timed out.") st.stop() - elif error and error_type == "ConnectionError": + except ConnectionError: st.error("Failed to establish a new connection with web.archive.org.") st.stop() - elif error and error_type: - st.error(f"{error}") + except EmptyResponseError: + st.error("No data was saved due to an empty response.") st.stop() diff --git a/docs/api.rst b/docs/api.rst index 12537b9..ee5ad43 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -1,53 +1,71 @@ API ==== -Request ---------- - -.. module:: waybacktweets.api.request_tweets +Config +------------ -.. autoclass:: WaybackTweets +.. automodule:: waybacktweets.config.config :members: +Exceptions +------------ -Parse ---------- +.. automodule:: waybacktweets.exceptions.exceptions -.. module:: waybacktweets.api.parse_tweets +.. autoclass:: ReadTimeoutError + :members: -.. autoclass:: TweetsParser +.. autoclass:: ConnectionError :members: -.. autoclass:: TwitterEmbed +.. autoclass:: HTTPError :members: -.. autoclass:: JsonParser +.. autoclass:: EmptyResponseError + :members: + +.. autoclass:: GetResponseError :members: Export --------- -.. module:: waybacktweets.api.export_tweets +.. automodule:: waybacktweets.api.export_tweets .. autoclass:: TweetsExporter :members: -Visualizer ------------ +Parse +--------- -.. module:: waybacktweets.api.viz_tweets +.. automodule:: waybacktweets.api.parse_tweets -.. autoclass:: HTMLTweetsVisualizer +.. autoclass:: TweetsParser + :members: + +.. 
autoclass:: TwitterEmbed + :members: + +.. autoclass:: JsonParser + :members: + + +Request +--------- + +.. automodule:: waybacktweets.api.request_tweets + +.. autoclass:: WaybackTweets :members: Utils ------- -.. module:: waybacktweets.utils.utils +.. automodule:: waybacktweets.utils.utils .. autofunction:: check_double_status .. autofunction:: check_pattern_tweet @@ -57,3 +75,12 @@ Utils .. autofunction:: get_response .. autofunction:: is_tweet_url .. autofunction:: semicolon_parser + + +Visualizer +----------- + +.. automodule:: waybacktweets.api.viz_tweets + +.. autoclass:: HTMLTweetsVisualizer + :members: diff --git a/docs/exceptions.rst b/docs/exceptions.rst index 109e41b..22f0f3f 100644 --- a/docs/exceptions.rst +++ b/docs/exceptions.rst @@ -3,7 +3,7 @@ Exceptions These are the most common errors and are handled by the ``waybacktweets`` package. -ReadTimeout +ReadTimeoutError ---------------- This error occurs when a request to the web.archive.org server takes too long to respond. The server could be overloaded or there could be network issues. @@ -29,4 +29,9 @@ This error occurs when the Internet Archive services are temporarily offline. Th The output message from the package would be: ``Temporarily Offline: Internet Archive services are temporarily offline. Please check Internet Archive Twitter feed (https://twitter.com/internetarchive) for the latest information.`` +EmptyResponseError +---------------------- +This exception raised for empty responses. + +The output message from the package would be: ``No data was saved due to an empty response.`` diff --git a/waybacktweets/_cli.py b/waybacktweets/_cli.py index 753b8f5..74fb140 100644 --- a/waybacktweets/_cli.py +++ b/waybacktweets/_cli.py @@ -11,6 +11,7 @@ from rich import print as rprint from waybacktweets.api.export_tweets import TweetsExporter from waybacktweets.api.parse_tweets import TweetsParser from waybacktweets.api.request_tweets import WaybackTweets +from waybacktweets.config.config import config def parse_date( @@ -40,12 +41,14 @@ def parse_date( @click.command() @click.argument("username", type=str) @click.option( + "-c", "--collapse", type=click.Choice(["urlkey", "digest", "timestamp:XX"], case_sensitive=False), default=None, help="Collapse results based on a field, or a substring of a field. XX in the timestamp value ranges from 1 to 14, comparing the first XX digits of the timestamp field. It is recommended to use from 4 onwards, to compare at least by years.", # noqa: E501 ) @click.option( + "-f", "--from", "timestamp_from", type=click.UNPROCESSED, @@ -55,6 +58,7 @@ def parse_date( help="Filtering by date range from this date. Format: YYYYmmdd", ) @click.option( + "-t", "--to", "timestamp_to", type=click.UNPROCESSED, @@ -64,9 +68,15 @@ def parse_date( help="Filtering by date range up to this date. Format: YYYYmmdd", ) @click.option( - "--limit", type=int, metavar="INTEGER", default=None, help="Query result limits." 
+ "-l", + "--limit", + type=int, + metavar="INTEGER", + default=None, + help="Query result limits.", ) @click.option( + "-o", "--offset", type=int, metavar="INTEGER", @@ -74,11 +84,20 @@ def parse_date( help="Allows for a simple way to scroll through the results.", ) @click.option( + "-mt", "--matchtype", type=click.Choice(["exact", "prefix", "host", "domain"], case_sensitive=False), default=None, help="Results matching a certain prefix, a certain host or all subdomains.", # noqa: E501 ) +@click.option( + "-v", + "--verbose", + "verbose", + is_flag=True, + default=False, + help="Shows the error log.", +) def main( username: str, collapse: Optional[str], @@ -87,6 +106,7 @@ def main( limit: Optional[int], offset: Optional[int], matchtype: Optional[str], + verbose: Optional[bool], ) -> None: """ Retrieves archived tweets CDX data from the Wayback Machine, @@ -95,6 +115,8 @@ def main( USERNAME: The Twitter username without @. """ try: + config.verbose = verbose + api = WaybackTweets( username, collapse, timestamp_from, timestamp_to, limit, offset, matchtype ) diff --git a/waybacktweets/api/parse_tweets.py b/waybacktweets/api/parse_tweets.py index 585aec2..0b6c8d5 100644 --- a/waybacktweets/api/parse_tweets.py +++ b/waybacktweets/api/parse_tweets.py @@ -11,6 +11,12 @@ from urllib.parse import unquote from rich import print as rprint from rich.progress import Progress +from waybacktweets.config.config import config +from waybacktweets.exceptions.exceptions import ( + ConnectionError, + GetResponseError, + HTTPError, +) from waybacktweets.utils.utils import ( check_double_status, check_pattern_tweet, @@ -50,53 +56,56 @@ class TwitterEmbed: availability statuses, and URLs, respectively. If no tweets are available, returns None. """ - url = f"https://publish.twitter.com/oembed?url={self.tweet_url}" - response, error, error_type = get_response(url=url) - - if response: - json_response = response.json() - html = json_response["html"] - author_name = json_response["author_name"] + try: + url = f"https://publish.twitter.com/oembed?url={self.tweet_url}" + response = get_response(url=url) + if response: + json_response = response.json() + html = json_response["html"] + author_name = json_response["author_name"] + + regex = re.compile( + r'
<blockquote class="twitter-tweet"(?: [^>]+)?><p[^>]*>(.*?)<\/p>.*?— (.*?)<\/a>', # noqa
+                    re.DOTALL,
+                )
+                regex_author = re.compile(r"^(.*?)\s*\(")
+
+                matches_html = regex.findall(html)
+
+                tweet_content = []
+                user_info = []
+                is_RT = []
+
+                for match in matches_html:
+                    tweet_content_match = re.sub(
+                        r"<a[^>]*>|<\/a>", "", match[0].strip()
+                    ).replace("<br>", "\n")
+                    user_info_match = re.sub(
+                        r"<a[^>]*>|<\/a>", "", match[1].strip()
+                    ).replace(")", "), ")
+                    match_author = regex_author.search(user_info_match)
+                    author_tweet = match_author.group(1) if match_author else ""
+
+                    if tweet_content_match:
+                        tweet_content.append(tweet_content_match)
+                    if user_info_match:
+                        user_info.append(user_info_match)
+                    is_RT.append(author_name != author_tweet)
+
+                return tweet_content, is_RT, user_info
+        except ConnectionError:
+            if config.verbose:
+                rprint("[yellow]Error parsing the tweet, but the CDX data was saved.")
+        except HTTPError:
+            if config.verbose:
+                rprint(
+                    f"[yellow]{self.tweet_url} not available on the user's account, but the CDX data was saved." # noqa: E501
+                )
+        except GetResponseError as e:
+            if config.verbose:
+                rprint(f"[red]An error occurred: {str(e)}")
-            regex = re.compile(
-                r'