add verbose option, delete log option, review exceptions, update docs, add global...
authorClaromes <claromes@hey.com>
Mon, 17 Jun 2024 01:27:51 +0000 (22:27 -0300)
committerClaromes <claromes@hey.com>
Mon, 17 Jun 2024 01:27:51 +0000 (22:27 -0300)
15 files changed:
app/app.py
docs/api.rst
docs/exceptions.rst
waybacktweets/_cli.py
waybacktweets/api/parse_tweets.py
waybacktweets/api/request_tweets.py
waybacktweets/config/__init__.py [new file with mode: 0644]
waybacktweets/config/__pycache__/__init__.cpython-311.pyc [new file with mode: 0644]
waybacktweets/config/__pycache__/config.cpython-311.pyc [new file with mode: 0644]
waybacktweets/config/config.py [new file with mode: 0644]
waybacktweets/exceptions/__init__.py [new file with mode: 0644]
waybacktweets/exceptions/__pycache__/__init__.cpython-311.pyc [new file with mode: 0644]
waybacktweets/exceptions/__pycache__/exceptions.cpython-311.pyc [new file with mode: 0644]
waybacktweets/exceptions/exceptions.py [new file with mode: 0644]
waybacktweets/utils/utils.py

index 87d630fef0f7f22930e8020d816df5161941583d..b1db36b5fcb7c3ef15e4ecc47595f0cb3084bc3a 100644 (file)
@@ -6,6 +6,12 @@ import streamlit.components.v1 as components
 from waybacktweets.api.export_tweets import TweetsExporter
 from waybacktweets.api.parse_tweets import JsonParser, TweetsParser
 from waybacktweets.api.request_tweets import WaybackTweets
+from waybacktweets.config.config import config
+from waybacktweets.exceptions.exceptions import (
+    ConnectionError,
+    EmptyResponseError,
+    ReadTimeoutError,
+)
 from waybacktweets.utils.utils import (
     check_double_status,
     get_response,
@@ -93,6 +99,11 @@ if "archived_timestamp_filter" not in st.session_state:
     st.session_state.archived_timestamp_filter = (start_date, end_date)
 
 
+# Verbose mode configuration
+
+config.verbose = False
+
+
 # Pagination Settings
 
 
@@ -128,23 +139,24 @@ def next_page():
 def tweets_count(username, archived_timestamp_filter):
     url = f"https://web.archive.org/cdx/search/cdx?url=https://twitter.com/{username}/status/*&output=json&from={archived_timestamp_filter[0]}&to={archived_timestamp_filter[1]}"  # noqa: E501
 
-    response, error, error_type = get_response(url=url)
-
-    if response.status_code == 200:
-        data = response.json()
-        if data and len(data) > 1:
-            total_tweets = len(data) - 1
-            return total_tweets
-        else:
-            return 0
-    elif error and error_type == "ReadTimeout":
-        st.error("Failed to establish a new connection with web.archive.org.")
+    try:
+        response = get_response(url=url)
+
+        if response.status_code == 200:
+            data = response.json()
+            if data and len(data) > 1:
+                total_tweets = len(data) - 1
+                return total_tweets
+            else:
+                return 0
+    except ReadTimeoutError:
+        st.error("Connection to web.archive.org timed out.")
         st.stop()
-    elif error and error_type == "ConnectionError":
+    except ConnectionError:
         st.error("Failed to establish a new connection with web.archive.org.")
         st.stop()
-    elif error and error_type:
-        st.error(f"{error}")
+    except EmptyResponseError:
+        st.error("No data was saved due to an empty response.")
         st.stop()
 
 
index 12537b95c97047a4dc756eb380f4117c9289c105..ee5ad43b08f811501b200fb5d29a98776d4db948 100644 (file)
@@ -1,53 +1,71 @@
 API
 ====
 
-Request
----------
-
-.. module:: waybacktweets.api.request_tweets
+Config
+------------
 
-.. autoclass:: WaybackTweets
+.. automodule:: waybacktweets.config.config
     :members:
 
 
+Exceptions
+------------
 
-Parse
----------
+.. automodule:: waybacktweets.exceptions.exceptions
 
-.. module:: waybacktweets.api.parse_tweets
+.. autoclass:: ReadTimeoutError
+    :members:
 
-.. autoclass:: TweetsParser
+.. autoclass:: ConnectionError
     :members:
 
-.. autoclass:: TwitterEmbed
+.. autoclass:: HTTPError
     :members:
 
-.. autoclass:: JsonParser
+.. autoclass:: EmptyResponseError
+    :members:
+
+.. autoclass:: GetResponseError
     :members:
 
 
 Export
 ---------
 
-.. module:: waybacktweets.api.export_tweets
+.. automodule:: waybacktweets.api.export_tweets
 
 .. autoclass:: TweetsExporter
     :members:
 
 
-Visualizer
------------
+Parse
+---------
 
-.. module:: waybacktweets.api.viz_tweets
+.. automodule:: waybacktweets.api.parse_tweets
 
-.. autoclass:: HTMLTweetsVisualizer
+.. autoclass:: TweetsParser
+    :members:
+
+.. autoclass:: TwitterEmbed
+    :members:
+
+.. autoclass:: JsonParser
+    :members:
+
+
+Request
+---------
+
+.. automodule:: waybacktweets.api.request_tweets
+
+.. autoclass:: WaybackTweets
     :members:
 
 
 Utils
 -------
 
-.. module:: waybacktweets.utils.utils
+.. automodule:: waybacktweets.utils.utils
 
 .. autofunction:: check_double_status
 .. autofunction:: check_pattern_tweet
@@ -57,3 +75,12 @@ Utils
 .. autofunction:: get_response
 .. autofunction:: is_tweet_url
 .. autofunction:: semicolon_parser
+
+
+Visualizer
+-----------
+
+.. automodule:: waybacktweets.api.viz_tweets
+
+.. autoclass:: HTMLTweetsVisualizer
+    :members:
index 109e41b91b03dfbbc732f5ed09dc5a4a16c9dc7f..22f0f3f309896fd442e3111c988e0b619eac0276 100644 (file)
@@ -3,7 +3,7 @@ Exceptions
 
 These are the most common errors and are handled by the ``waybacktweets`` package.
 
-ReadTimeout
+ReadTimeoutError
 ----------------
 
 This error occurs when a request to the web.archive.org server takes too long to respond. The server could be overloaded or there could be network issues.
@@ -29,4 +29,9 @@ This error occurs when the Internet Archive services are temporarily offline. Th
 
 The output message from the package would be: ``Temporarily Offline: Internet Archive services are temporarily offline. Please check Internet Archive Twitter feed (https://twitter.com/internetarchive) for the latest information.``
 
+EmptyResponseError
+----------------------
 
+This exception is raised for empty responses.
+
+The output message from the package would be: ``No data was saved due to an empty response.``
index 753b8f58b5b862589539121d89ac659e7a25f6e7..74fb1401cef318dca4e2e7f03547fe9e1179db23 100644 (file)
@@ -11,6 +11,7 @@ from rich import print as rprint
 from waybacktweets.api.export_tweets import TweetsExporter
 from waybacktweets.api.parse_tweets import TweetsParser
 from waybacktweets.api.request_tweets import WaybackTweets
+from waybacktweets.config.config import config
 
 
 def parse_date(
@@ -40,12 +41,14 @@ def parse_date(
 @click.command()
 @click.argument("username", type=str)
 @click.option(
+    "-c",
     "--collapse",
     type=click.Choice(["urlkey", "digest", "timestamp:XX"], case_sensitive=False),
     default=None,
     help="Collapse results based on a field, or a substring of a field. XX in the timestamp value ranges from 1 to 14, comparing the first XX digits of the timestamp field. It is recommended to use from 4 onwards, to compare at least by years.",  # noqa: E501
 )
 @click.option(
+    "-f",
     "--from",
     "timestamp_from",
     type=click.UNPROCESSED,
@@ -55,6 +58,7 @@ def parse_date(
     help="Filtering by date range from this date. Format: YYYYmmdd",
 )
 @click.option(
+    "-t",
     "--to",
     "timestamp_to",
     type=click.UNPROCESSED,
@@ -64,9 +68,15 @@ def parse_date(
     help="Filtering by date range up to this date. Format: YYYYmmdd",
 )
 @click.option(
-    "--limit", type=int, metavar="INTEGER", default=None, help="Query result limits."
+    "-l",
+    "--limit",
+    type=int,
+    metavar="INTEGER",
+    default=None,
+    help="Query result limits.",
 )
 @click.option(
+    "-o",
     "--offset",
     type=int,
     metavar="INTEGER",
@@ -74,11 +84,20 @@ def parse_date(
     help="Allows for a simple way to scroll through the results.",
 )
 @click.option(
+    "-mt",
     "--matchtype",
     type=click.Choice(["exact", "prefix", "host", "domain"], case_sensitive=False),
     default=None,
     help="Results matching a certain prefix, a certain host or all subdomains.",  # noqa: E501
 )
+@click.option(
+    "-v",
+    "--verbose",
+    "verbose",
+    is_flag=True,
+    default=False,
+    help="Shows the error log.",
+)
 def main(
     username: str,
     collapse: Optional[str],
@@ -87,6 +106,7 @@ def main(
     limit: Optional[int],
     offset: Optional[int],
     matchtype: Optional[str],
+    verbose: Optional[bool],
 ) -> None:
     """
     Retrieves archived tweets CDX data from the Wayback Machine,
@@ -95,6 +115,8 @@ def main(
     USERNAME: The Twitter username without @.
     """
     try:
+        config.verbose = verbose
+
         api = WaybackTweets(
             username, collapse, timestamp_from, timestamp_to, limit, offset, matchtype
         )
index 585aec27a99fbcfa331d3341e6844b3f4803ab11..0b6c8d501756073bbca607abda1ce307ceb0c0fd 100644 (file)
@@ -11,6 +11,12 @@ from urllib.parse import unquote
 from rich import print as rprint
 from rich.progress import Progress
 
+from waybacktweets.config.config import config
+from waybacktweets.exceptions.exceptions import (
+    ConnectionError,
+    GetResponseError,
+    HTTPError,
+)
 from waybacktweets.utils.utils import (
     check_double_status,
     check_pattern_tweet,
@@ -50,53 +56,56 @@ class TwitterEmbed:
             availability statuses, and URLs, respectively. If no tweets are available,
             returns None.
         """
-        url = f"https://publish.twitter.com/oembed?url={self.tweet_url}"
-        response, error, error_type = get_response(url=url)
-
-        if response:
-            json_response = response.json()
-            html = json_response["html"]
-            author_name = json_response["author_name"]
+        try:
+            url = f"https://publish.twitter.com/oembed?url={self.tweet_url}"
+            response = get_response(url=url)
+            if response:
+                json_response = response.json()
+                html = json_response["html"]
+                author_name = json_response["author_name"]
+
+                regex = re.compile(
+                    r'<blockquote class="twitter-tweet"(?: [^>]+)?><p[^>]*>(.*?)<\/p>.*?&mdash; (.*?)<\/a>',  # noqa
+                    re.DOTALL,
+                )
+                regex_author = re.compile(r"^(.*?)\s*\(")
+
+                matches_html = regex.findall(html)
+
+                tweet_content = []
+                user_info = []
+                is_RT = []
+
+                for match in matches_html:
+                    tweet_content_match = re.sub(
+                        r"<a[^>]*>|<\/a>", "", match[0].strip()
+                    ).replace("<br>", "\n")
+                    user_info_match = re.sub(
+                        r"<a[^>]*>|<\/a>", "", match[1].strip()
+                    ).replace(")", "), ")
+                    match_author = regex_author.search(user_info_match)
+                    author_tweet = match_author.group(1) if match_author else ""
+
+                    if tweet_content_match:
+                        tweet_content.append(tweet_content_match)
+                    if user_info_match:
+                        user_info.append(user_info_match)
+                        is_RT.append(author_name != author_tweet)
+
+                return tweet_content, is_RT, user_info
+        except ConnectionError:
+            if config.verbose:
+                rprint("[yellow]Error parsing the tweet, but the CDX data was saved.")
+        except HTTPError:
+            if config.verbose:
+                rprint(
+                    f"[yellow]{self.tweet_url} not available on the user's account, but the CDX data was saved."  # noqa: E501
+                )
+        except GetResponseError as e:
+            if config.verbose:
+                rprint(f"[red]An error occurred: {str(e)}")
 
-            regex = re.compile(
-                r'<blockquote class="twitter-tweet"(?: [^>]+)?><p[^>]*>(.*?)<\/p>.*?&mdash; (.*?)<\/a>',  # noqa
-                re.DOTALL,
-            )
-            regex_author = re.compile(r"^(.*?)\s*\(")
-
-            matches_html = regex.findall(html)
-
-            tweet_content = []
-            user_info = []
-            is_RT = []
-
-            for match in matches_html:
-                tweet_content_match = re.sub(
-                    r"<a[^>]*>|<\/a>", "", match[0].strip()
-                ).replace("<br>", "\n")
-                user_info_match = re.sub(
-                    r"<a[^>]*>|<\/a>", "", match[1].strip()
-                ).replace(")", "), ")
-                match_author = regex_author.search(user_info_match)
-                author_tweet = match_author.group(1) if match_author else ""
-
-                if tweet_content_match:
-                    tweet_content.append(tweet_content_match)
-                if user_info_match:
-                    user_info.append(user_info_match)
-                    is_RT.append(author_name != author_tweet)
-
-            return tweet_content, is_RT, user_info
-        elif error and error_type == "ConnectionError":
-            rprint("[yellow]Error parsing the tweet, but the CDX data was saved.")
-        elif error and error_type == "HTTPError":
-            rprint(
-                f"[yellow]{self.tweet_url} not available on the user's account, but the CDX data was saved."  # noqa: E501
-            )
-            return None
-        elif error and error_type:
-            rprint(f"[red]{error}")
-            return None
+        return None
 
 
 # TODO: JSON Issue - Create separate function to handle JSON return without hitting rate limiting # noqa: E501
@@ -118,28 +127,31 @@ class JsonParser:
 
         :returns: The parsed tweet text.
         """
-        response, error, error_type = get_response(url=self.archived_tweet_url)
+        try:
+            response = get_response(url=self.archived_tweet_url)
+
+            if response:
+                json_data = response.json()
 
-        if response:
-            json_data = response.json()
+                if "data" in json_data:
+                    return json_data["data"].get("text", json_data["data"])
 
-            if "data" in json_data:
-                return json_data["data"].get("text", json_data["data"])
+                if "retweeted_status" in json_data:
+                    return json_data["retweeted_status"].get(
+                        "text", json_data["retweeted_status"]
+                    )
 
-            if "retweeted_status" in json_data:
-                return json_data["retweeted_status"].get(
-                    "text", json_data["retweeted_status"]
+                return json_data.get("text", json_data)
+        except ConnectionError:
+            if config.verbose:
+                rprint(
+                    f"[yellow]Connection error with {self.archived_tweet_url}. Max retries exceeded. Error parsing the JSON, but the CDX data was saved."  # noqa: E501
                 )
+        except GetResponseError as e:
+            if config.verbose:
+                rprint(f"[red]An error occurred: {str(e)}")
 
-            return json_data.get("text", json_data)
-        elif error and error_type == "ConnectionError":
-            rprint(
-                f"[yellow]Connection error with {self.archived_tweet_url}. Max retries exceeded. Error parsing the JSON, but the CDX data was saved."  # noqa: E501
-            )
-            return None
-        elif error and error_type:
-            rprint(f"[red]{error}")
-            return None
+        return None
 
 
 class TweetsParser:
@@ -252,6 +264,7 @@ class TweetsParser:
         Parses the archived tweets CDX data and structures it.
 
         :param print_progress: A boolean indicating whether to print progress or not.
+
         :returns: The parsed tweets data.
         """
         with ThreadPoolExecutor(max_workers=10) as executor:
index a215b6a3ae5b2a5705bcb9c4d2882b5ea32d61e2..d7d37a149f998f4165f29e995c3ae965c0ba5cad 100644 (file)
@@ -6,6 +6,14 @@ from typing import Any, Dict, Optional
 
 from rich import print as rprint
 
+from waybacktweets.config.config import config
+from waybacktweets.exceptions.exceptions import (
+    ConnectionError,
+    EmptyResponseError,
+    GetResponseError,
+    HTTPError,
+    ReadTimeoutError,
+)
 from waybacktweets.utils.utils import get_response
 
 
@@ -76,19 +84,25 @@ class WaybackTweets:
         if self.matchtype:
             params["matchType"] = self.matchtype
 
-        response, error, error_type = get_response(url=url, params=params)
-
-        if response:
+        try:
+            response = get_response(url=url, params=params)
             return response.json()
-        elif error and error_type == "ReadTimeout":
-            rprint("[red]Connection to web.archive.org timed out.")
-        elif error and error_type == "ConnectionError":
-            rprint(
-                "[red]Failed to establish a new connection with web.archive.org. Max retries exceeded. Please wait a few minutes and try again."  # noqa: E501
-            )
-        elif error and error_type == "HTTPError":
-            rprint("[red]Connection to web.archive.org timed out.")
-        elif error and error_type:
-            rprint(
-                "[red]Temporarily Offline: Internet Archive services are temporarily offline. Please check Internet Archive Twitter feed (https://twitter.com/internetarchive) for the latest information."  # noqa: E501
-            )
+        except ReadTimeoutError:
+            if config.verbose:
+                rprint("[red]Connection to web.archive.org timed out.")
+        except ConnectionError:
+            if config.verbose:
+                rprint(
+                    "[red]Failed to establish a new connection with web.archive.org. Max retries exceeded. Please wait a few minutes and try again."  # noqa: E501
+                )
+        except HTTPError as e:
+            if config.verbose:
+                rprint(f"[red]HTTP error occurred: {str(e)}")
+        except EmptyResponseError:
+            if config.verbose:
+                rprint("[red]No data was saved due to an empty response.")
+        except GetResponseError as e:
+            if config.verbose:
+                rprint(f"[red]An error occurred: {str(e)}")
+
+        return None
diff --git a/waybacktweets/config/__init__.py b/waybacktweets/config/__init__.py
new file mode 100644 (file)
index 0000000..457fa4f
--- /dev/null
@@ -0,0 +1,3 @@
+# flake8: noqa: F401
+
+from waybacktweets.config.config import config
diff --git a/waybacktweets/config/__pycache__/__init__.cpython-311.pyc b/waybacktweets/config/__pycache__/__init__.cpython-311.pyc
new file mode 100644 (file)
index 0000000..9b453bc
Binary files /dev/null and b/waybacktweets/config/__pycache__/__init__.cpython-311.pyc differ
diff --git a/waybacktweets/config/__pycache__/config.cpython-311.pyc b/waybacktweets/config/__pycache__/config.cpython-311.pyc
new file mode 100644 (file)
index 0000000..2bf4595
Binary files /dev/null and b/waybacktweets/config/__pycache__/config.cpython-311.pyc differ
diff --git a/waybacktweets/config/config.py b/waybacktweets/config/config.py
new file mode 100644 (file)
index 0000000..eb6e6dd
--- /dev/null
@@ -0,0 +1,18 @@
+"""
+Manages global configuration settings throughout the application.
+"""
+
+
+class _Config:
+    def __init__(self, verbose: bool = True):
+        self.verbose = verbose
+
+
+config = _Config()
+"""
+Configuration settings.
+
+.. attribute:: verbose
+
+    Determines if verbose logging should be enabled.
+"""
diff --git a/waybacktweets/exceptions/__init__.py b/waybacktweets/exceptions/__init__.py
new file mode 100644 (file)
index 0000000..ee01628
--- /dev/null
@@ -0,0 +1,9 @@
+# flake8: noqa: F401
+
+from waybacktweets.exceptions.exceptions import (
+    ConnectionError,
+    EmptyResponseError,
+    GetResponseError,
+    HTTPError,
+    ReadTimeoutError,
+)
diff --git a/waybacktweets/exceptions/__pycache__/__init__.cpython-311.pyc b/waybacktweets/exceptions/__pycache__/__init__.cpython-311.pyc
new file mode 100644 (file)
index 0000000..8d74104
Binary files /dev/null and b/waybacktweets/exceptions/__pycache__/__init__.cpython-311.pyc differ
diff --git a/waybacktweets/exceptions/__pycache__/exceptions.cpython-311.pyc b/waybacktweets/exceptions/__pycache__/exceptions.cpython-311.pyc
new file mode 100644 (file)
index 0000000..5ade7da
Binary files /dev/null and b/waybacktweets/exceptions/__pycache__/exceptions.cpython-311.pyc differ
diff --git a/waybacktweets/exceptions/exceptions.py b/waybacktweets/exceptions/exceptions.py
new file mode 100644 (file)
index 0000000..383fb80
--- /dev/null
@@ -0,0 +1,23 @@
+"""
+Wayback Tweets Exceptions
+"""
+
+
+class GetResponseError(Exception):
+    """Base class for exceptions in get_response."""
+
+
+class ReadTimeoutError(GetResponseError):
+    """Exception raised for read timeout errors."""
+
+
+class ConnectionError(GetResponseError):
+    """Exception raised for connection errors."""
+
+
+class HTTPError(GetResponseError):
+    """Exception raised for HTTP errors."""
+
+
+class EmptyResponseError(GetResponseError):
+    """Exception raised for empty responses."""
index 26a1d88d9224adb7541f3df75ad79df5f66dcb45..326143455d68396c0eb050a83a3cfaf571e4b470 100644 (file)
@@ -9,19 +9,30 @@ import requests
 from requests.adapters import HTTPAdapter
 from urllib3.util.retry import Retry
 
+from waybacktweets.exceptions.exceptions import (
+    ConnectionError,
+    EmptyResponseError,
+    GetResponseError,
+    HTTPError,
+    ReadTimeoutError,
+)
+
 
 def get_response(
     url: str, params: Optional[dict] = None
 ) -> Tuple[Optional[requests.Response], Optional[str], Optional[str]]:
     """
-    Sends a GET request to the specified URL and returns the response,
-    an error message if any, and the type of exception if any.
+    Sends a GET request to the specified URL and returns the response.
 
     :param url: The URL to send the GET request to.
     :param params: The parameters to include in the GET request.
 
-    :returns: A tuple containing the response from the server or None,
-              an error message or None, and the type of exception or None.
+    :returns: The response from the server.
+
+    :raises ReadTimeoutError: If a read timeout occurs.
+    :raises ConnectionError: If a connection error occurs.
+    :raises HTTPError: If an HTTP error occurs.
+    :raises EmptyResponseError: If the response is empty.
     """
     session = requests.Session()
     retry = Retry(connect=3, backoff_factor=0.3)
@@ -38,12 +49,16 @@ def get_response(
         response.raise_for_status()
 
         if not response or response.json() == []:
-            return None, "No data was saved due to an empty response.", None
-        return response, None, None
-    except requests.exceptions.RequestException as e:
-        return None, str(e), type(e).__name__
-    except Exception as e:
-        return None, str(e), type(e).__name__
+            raise EmptyResponseError("No data was saved due to an empty response.")
+        return response
+    except requests.exceptions.ReadTimeout:
+        raise ReadTimeoutError
+    except requests.exceptions.ConnectionError:
+        raise ConnectionError
+    except requests.exceptions.HTTPError:
+        raise HTTPError
+    except requests.exceptions.RequestException:
+        raise GetResponseError
 
 
 def clean_tweet_url(tweet_url: str, username: str) -> str: