### Using Wayback Tweets as a Web App
-[Access the application](https://waybacktweets.streamlit.app), a prototype written in Python with the Streamlit framework and hosted on Streamlit Cloud.
+[Open the application](https://waybacktweets.streamlit.app), a prototype written in Python with the Streamlit framework and hosted on Streamlit Cloud.
## Documentation
import datetime
-import requests
import streamlit as st
import streamlit.components.v1 as components
"About": f"""
[](https://github.com/claromes/waybacktweets/releases) [](https://github.com/claromes/waybacktweets/blob/main/LICENSE.md) [](https://github.com/claromes/waybacktweets)
- Aplication that displays multiple archived tweets on Wayback Machine to avoid opening each link manually.
+ Application that displays multiple archived tweets on Wayback Machine to avoid opening each link manually.
The application is a prototype hosted on Streamlit Cloud, allowing users to apply filters and view tweets that lack the original URL. [Read more](https://claromes.github.io/waybacktweets/streamlit.html).
- © Copyright 2023 - {datetime.datetime.now().year}, [Claromes](https://claromes.com) · Icon by The Doodle Library
+ © 2023 - {datetime.datetime.now().year}, [Claromes](https://claromes.com) · Icon by The Doodle Library · Title font by Google, licensed under the Open Font License
---
""", # noqa: E501
def tweets_count(username, archived_timestamp_filter):
    url = f"https://web.archive.org/cdx/search/cdx?url=https://twitter.com/{username}/status/*&output=json&from={archived_timestamp_filter[0]}&to={archived_timestamp_filter[1]}"  # noqa: E501
-    try:
-        response = get_response(url=url)
-
-        if response.status_code == 200:
-            data = response.json()
-            if data and len(data) > 1:
-                total_tweets = len(data) - 1
-                return total_tweets
-            else:
-                return 0
-    except requests.exceptions.ReadTimeout:
-        st.error("Connection to web.archive.org timed out.")
+    response, error, error_type = get_response(url=url)
+
+    if response and response.status_code == 200:
+        data = response.json()
+        if data and len(data) > 1:
+            total_tweets = len(data) - 1
+            return total_tweets
+        else:
+            return 0
+    elif error and error_type == "ReadTimeout":
+        st.error("Connection to web.archive.org timed out.")
        st.stop()
-    except requests.exceptions.ConnectionError:
+    elif error and error_type == "ConnectionError":
        st.error("Failed to establish a new connection with web.archive.org.")
        st.stop()
-    except Exception as e:
-        st.error(f"{e}")
+    elif error and error_type:
+        st.error(f"{error}")
        st.stop()
project = "Wayback Tweets"
release, version = get_version("waybacktweets")
-copyright = f"2023 - {datetime.datetime.now().year}, Claromes · Icon by The Doodle Library · Title Font by Google, licensed under the Open Font License · Wayback Tweets v{version}" # noqa: E501
+copyright = f"2023 - {datetime.datetime.now().year}, Claromes · Icon by The Doodle Library · Title font by Google, licensed under the Open Font License · Wayback Tweets v{version}" # noqa: E501
author = "Claromes"
# -- General configuration ---------------------------------------------------
+++ /dev/null
-Errors
-================
-
-These are the most common errors and are handled by the ``waybacktweets`` package.
-
-ReadTimeout
-----------------
-
-This error occurs when a request to the web.archive.org server takes too long to respond. The server could be overloaded or there could be network issues.
-
-The output message from the package would be: ``Connection to web.archive.org timed out.``
-
-ConnectionError
-----------------
-
-This error is raised when the package fails to establish a new connection with web.archive.org. This could be due to network issues or the server being down.
-
-The output message from the package would be: ``Failed to establish a new connection with web.archive.org. Max retries exceeded.``
-
-
-This is the error often returned when performing experimental parsing of URLs with the mimetype ``application/json``.
-
-The warning output message from the package would be: ``Connection error with https://web.archive.org/web/<TIMESTAMP>/https://twitter.com/<USERNAME>/status/<TWEET_ID>. Max retries exceeded. Error parsing the JSON, but the CDX data was saved.``
-
-HTTPError
-----------------
-
-This error occurs when the Internet Archive services are temporarily offline. This could be due to maintenance or server issues.
-
-The output message from the package would be: ``Temporarily Offline: Internet Archive services are temporarily offline. Please check Internet Archive Twitter feed (https://twitter.com/internetarchive) for the latest information.``
-
-
--- /dev/null
+Exceptions
+================
+
+These are the most common errors and are handled by the ``waybacktweets`` package.
+
+ReadTimeout
+----------------
+
+This error occurs when a request to the web.archive.org server takes too long to respond. The server could be overloaded or there could be network issues.
+
+The output message from the package would be: ``Connection to web.archive.org timed out.``
+
+ConnectionError
+----------------
+
+This error is raised when the package fails to establish a new connection with web.archive.org. This could be due to network issues or the server being down.
+
+The output message from the package would be: ``Failed to establish a new connection with web.archive.org. Max retries exceeded.``
+
+
+This is the error often returned when performing experimental parsing of URLs with the mimetype ``application/json``.
+
+The warning output message from the package would be: ``Connection error with https://web.archive.org/web/<TIMESTAMP>/https://twitter.com/<USERNAME>/status/<TWEET_ID>. Max retries exceeded. Error parsing the JSON, but the CDX data was saved.``
+
+HTTPError
+----------------
+
+This error occurs when the Internet Archive services are temporarily offline. This could be due to maintenance or server issues.
+
+The output message from the package would be: ``Temporarily Offline: Internet Archive services are temporarily offline. Please check Internet Archive Twitter feed (https://twitter.com/internetarchive) for the latest information.``
+
+
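With the tuple-based ``get_response`` helper described above, callers branch on the returned error type instead of wrapping every request in ``try``/``except``. A minimal sketch of that pattern — the helper below is a simplified stand-in written for illustration, not the packaged implementation:

```python
import requests


def get_response(url, params=None):
    # Simplified stand-in: return (response, error_message, error_type)
    # instead of raising, mirroring the pattern described above.
    try:
        response = requests.get(url, params=params, timeout=10)
        response.raise_for_status()
        return response, None, None
    except requests.exceptions.RequestException as e:
        return None, str(e), type(e).__name__


# An unresolvable host triggers the ConnectionError branch without
# depending on web.archive.org being reachable.
response, error, error_type = get_response("http://nonexistent.invalid/")
if response is not None:
    print(response.status_code)
elif error_type == "ReadTimeout":
    print("Connection to web.archive.org timed out.")
elif error_type == "ConnectionError":
    print("Failed to establish a new connection with web.archive.org.")
else:
    print(error)
```

The messages printed here are the ones documented in the sections above; the exception class name (``type(e).__name__``) is what selects the branch.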
   quickstart
   workflow
   result
-   errors
+   exceptions
   contribute
   todo
.. code-block:: shell

-   waybacktweets --from 20150101 --to 20191231 --limit 250 jack`
+   waybacktweets --from 20150101 --to 20191231 --limit 250 jack
Module
Using Wayback Tweets as a Streamlit Web App
-`Access the application <https://waybacktweets.streamlit.app>`_, a prototype written in Python with the Streamlit framework and hosted on Streamlit Cloud.
+`Open the application <https://waybacktweets.streamlit.app>`_, a prototype written in Python with the Streamlit framework and hosted on Streamlit Cloud.
-Aplication that displays multiple archived tweets on Wayback Machine to avoid opening each link manually. The application is a prototype written in Python with the Streamlit framework and hosted on Streamlit Cloud, allowing users to apply filters and view tweets that lack the original URL.
+Application that displays multiple archived tweets on Wayback Machine to avoid opening each link manually. The application is a prototype written in Python with the Streamlit framework and hosted on Streamlit Cloud, allowing users to apply filters and view tweets that lack the original URL.
+`Open the application <https://waybacktweets.streamlit.app>`_.
+
Filters
----------
C--> |4xx| E[return None]
E--> F{request Archived\nTweet URL}
F--> |4xx| G[return Only CDX data]
- F--> |TODO: 2xx/3xx: application/json| J[return JSON text]
+ F--> |2xx/3xx: application/json| J[return JSON text]
F--> |2xx/3xx: text/html, warc/revisit, unk| K[return HTML iframe tag]
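The archived-tweet branch of the chart can be read as a small dispatch on status code and mimetype. A hedged sketch — `render_archived_tweet` and its parameters are illustrative names, not part of the package:

```python
def render_archived_tweet(archived_url: str, status_code: int, mimetype: str):
    """Illustrative mirror of the flowchart's archived-tweet branch."""
    if 400 <= status_code <= 511:
        return "CDX data only"  # 4xx/5xx: the snapshot is not embeddable
    if mimetype == "application/json":
        return "JSON text"  # the snapshot body is parsed as JSON
    # text/html, warc/revisit, or unknown mimetypes render as an iframe
    return f'<iframe src="{archived_url}"></iframe>'


print(render_archived_tweet("https://web.archive.org/web/0/x", 200, "text/html"))
```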
+"""
+Exports the parsed archived tweets.
+"""
+
import datetime
import os
import re
+"""
+Parses the returned data from the Wayback CDX Server API.
+"""
+
import re
from concurrent.futures import ThreadPoolExecutor, as_completed
from contextlib import nullcontext
from typing import Any, Dict, List, Optional, Tuple
from urllib.parse import unquote
-from requests import exceptions
from rich import print as rprint
from rich.progress import Progress
availability statuses, and URLs, respectively. If no tweets are available,
returns None.
"""
-        try:
-            url = f"https://publish.twitter.com/oembed?url={self.tweet_url}"
-            response = get_response(url=url)
-
-            if response:
-                json_response = response.json()
-                html = json_response["html"]
-                author_name = json_response["author_name"]
-
-                regex = re.compile(
-                    r'<blockquote class="twitter-tweet"(?: [^>]+)?><p[^>]*>(.*?)<\/p>.*?— (.*?)<\/a>',  # noqa
-                    re.DOTALL,
-                )
-                regex_author = re.compile(r"^(.*?)\s*\(")
-
-                matches_html = regex.findall(html)
-
-                tweet_content = []
-                user_info = []
-                is_RT = []
-
-                for match in matches_html:
-                    tweet_content_match = re.sub(
-                        r"<a[^>]*>|<\/a>", "", match[0].strip()
-                    ).replace("<br>", "\n")
-                    user_info_match = re.sub(
-                        r"<a[^>]*>|<\/a>", "", match[1].strip()
-                    ).replace(")", "), ")
-                    match_author = regex_author.search(user_info_match)
-                    author_tweet = match_author.group(1) if match_author else ""
-
-                    if tweet_content_match:
-                        tweet_content.append(tweet_content_match)
-                    if user_info_match:
-                        user_info.append(user_info_match)
-                    is_RT.append(author_name != author_tweet)
-
-                return tweet_content, is_RT, user_info
-        except exceptions:
+        url = f"https://publish.twitter.com/oembed?url={self.tweet_url}"
+        response, error, error_type = get_response(url=url)
+
+        if response:
+            json_response = response.json()
+            html = json_response["html"]
+            author_name = json_response["author_name"]
+
+            regex = re.compile(
+                r'<blockquote class="twitter-tweet"(?: [^>]+)?><p[^>]*>(.*?)<\/p>.*?— (.*?)<\/a>',  # noqa
+                re.DOTALL,
+            )
+            regex_author = re.compile(r"^(.*?)\s*\(")
+
+            matches_html = regex.findall(html)
+
+            tweet_content = []
+            user_info = []
+            is_RT = []
+
+            for match in matches_html:
+                tweet_content_match = re.sub(
+                    r"<a[^>]*>|<\/a>", "", match[0].strip()
+                ).replace("<br>", "\n")
+                user_info_match = re.sub(
+                    r"<a[^>]*>|<\/a>", "", match[1].strip()
+                ).replace(")", "), ")
+                match_author = regex_author.search(user_info_match)
+                author_tweet = match_author.group(1) if match_author else ""
+
+                if tweet_content_match:
+                    tweet_content.append(tweet_content_match)
+                if user_info_match:
+                    user_info.append(user_info_match)
+                is_RT.append(author_name != author_tweet)
+
+            return tweet_content, is_RT, user_info
+        elif error and error_type == "ConnectionError":
            rprint("[yellow]Error parsing the tweet, but the CDX data was saved.")
+        elif error and error_type == "HTTPError":
+            rprint(
+                f"[yellow]{self.tweet_url} not available on the user's account, but the CDX data was saved."  # noqa: E501
+            )
            return None
-        except Exception as e:
-            rprint(f"[red]{e}")
+        elif error and error_type:
+            rprint(f"[red]{error}")
            return None
:returns: The parsed tweet text.
"""
-        try:
-            response = get_response(url=self.archived_tweet_url)
+        response, error, error_type = get_response(url=self.archived_tweet_url)
-            if response:
-                json_data = response.json()
+        if response:
+            json_data = response.json()
-                if "data" in json_data:
-                    return json_data["data"].get("text", json_data["data"])
+            if "data" in json_data:
+                return json_data["data"].get("text", json_data["data"])
-                if "retweeted_status" in json_data:
-                    return json_data["retweeted_status"].get(
-                        "text", json_data["retweeted_status"]
-                    )
+            if "retweeted_status" in json_data:
+                return json_data["retweeted_status"].get(
+                    "text", json_data["retweeted_status"]
+                )
-            return json_data.get("text", json_data)
-        except exceptions.ConnectionError:
+            return json_data.get("text", json_data)
+        elif error and error_type == "ConnectionError":
            rprint(
                f"[yellow]Connection error with {self.archived_tweet_url}. Max retries exceeded. Error parsing the JSON, but the CDX data was saved."  # noqa: E501
            )
-            return ""
-        except exceptions:
-            rprint("[yellow]Error parsing the JSON, but the CDX data was saved.")
-
-            return ""
-        except Exception as e:
-            rprint(f"[red]{e}")
-            return ""
+            return None
+        elif error and error_type:
+            rprint(f"[red]{error}")
+            return None
class TweetsParser:
+"""
+Requests data from the Wayback Machine API.
+"""
+
from typing import Any, Dict, Optional
-from requests import exceptions
from rich import print as rprint
from waybacktweets.utils.utils import get_response
        if self.matchtype:
            params["matchType"] = self.matchtype
-        try:
-            response = get_response(url=url, params=params)
+        response, error, error_type = get_response(url=url, params=params)
-            if response:
-                return response.json()
-        except exceptions.ReadTimeout:
+        if response:
+            return response.json()
+        elif error and error_type == "ReadTimeout":
            rprint("[red]Connection to web.archive.org timed out.")
-        except exceptions.ConnectionError:
+        elif error and error_type == "ConnectionError":
            rprint(
                "[red]Failed to establish a new connection with web.archive.org. Max retries exceeded. Please wait a few minutes and try again."  # noqa: E501
            )
-        except exceptions.HTTPError:
+        elif error and error_type == "HTTPError":
            rprint(
                "[red]Temporarily Offline: Internet Archive services are temporarily offline. Please check Internet Archive Twitter feed (https://twitter.com/internetarchive) for the latest information."  # noqa: E501
            )
-        except Exception as e:
-            rprint(f"[red]{e}")
+        elif error and error_type:
+            rprint(f"[red]{error}")
# flake8: noqa: E501
+"""
+Generates an HTML file to visualize the parsed data.
+"""
+
import json
from typing import Any, Dict, List
from typing import Any, Optional
import click
-from requests import exceptions
from rich import print as rprint
from waybacktweets.api.export_tweets import TweetsExporter
        exporter.save_to_csv()
        exporter.save_to_json()
        exporter.save_to_html()
-    except exceptions as e:
+    except Exception as e:
        rprint(f"[red]{e}")
    finally:
        rprint(
"""
-Module containing utility functions for handling HTTP requests and manipulating URLs.
+Utility functions for handling HTTP requests and manipulating URLs.
"""
import re
-from typing import Optional
+from typing import Optional, Tuple
import requests
from requests.adapters import HTTPAdapter
def get_response(
    url: str, params: Optional[dict] = None
-) -> Optional[requests.Response]:
+) -> Tuple[Optional[requests.Response], Optional[str], Optional[str]]:
    """
-    Sends a GET request to the specified URL and returns the response.
+    Sends a GET request to the specified URL and returns the response,
+    an error message if any, and the type of exception if any.

    :param url: The URL to send the GET request to.
    :param params: The parameters to include in the GET request.

-    :returns: The response from the server,
-        if the status code is not in the 400-511 range.
-        If the status code is in the 400-511 range.
+    :returns: A tuple containing the response from the server or None,
+        an error message or None, and the type of exception or None.
    """
    session = requests.Session()
    retry = Retry(connect=3, backoff_factor=0.3)
    session.mount("http://", adapter)
    session.mount("https://", adapter)
-    response = session.get(url, params=params, headers=headers)
-
-    if 400 <= response.status_code <= 511:
-        return None
-
-    return response
+    try:
+        response = session.get(url, params=params, headers=headers)
+        response.raise_for_status()
+
+        if not response.text.strip() or response.text.strip() == "[]":
+            return None, "No data was saved due to an empty response.", None
+        return response, None, None
+    except requests.exceptions.RequestException as e:
+        return None, str(e), type(e).__name__
+    except Exception as e:
+        return None, str(e), type(e).__name__
def clean_tweet_url(tweet_url: str, username: str) -> str: