From: Claromes Date: Sun, 16 Jun 2024 08:55:01 +0000 (-0300) Subject: update docs, review exceptions X-Git-Url: https://git.claromes.com/?a=commitdiff_plain;h=b1bbea09cce7969dabaf18b4d871ed2334af18a2;p=waybacktweets.git update docs, review exceptions --- diff --git a/README.md b/README.md index 5a8890f..1ddd1b3 100644 --- a/README.md +++ b/README.md @@ -47,7 +47,7 @@ if archived_tweets: ### Using Wayback Tweets as a Web App -[Access the application](https://waybacktweets.streamlit.app), a prototype written in Python with the Streamlit framework and hosted on Streamlit Cloud. +[Open the application](https://waybacktweets.streamlit.app), a prototype written in Python with the Streamlit framework and hosted on Streamlit Cloud. ## Documentation diff --git a/app/app.py b/app/app.py index e0245da..87d630f 100644 --- a/app/app.py +++ b/app/app.py @@ -1,6 +1,5 @@ import datetime -import requests import streamlit as st import streamlit.components.v1 as components @@ -26,11 +25,11 @@ st.set_page_config( "About": f""" [![GitHub release (latest by date including pre-releases)](https://img.shields.io/github/v/release/claromes/waybacktweets?include_prereleases)](https://github.com/claromes/waybacktweets/releases) [![License](https://img.shields.io/github/license/claromes/waybacktweets)](https://github.com/claromes/waybacktweets/blob/main/LICENSE.md) [![Star](https://img.shields.io/github/stars/claromes/waybacktweets?style=social)](https://github.com/claromes/waybacktweets) - Aplication that displays multiple archived tweets on Wayback Machine to avoid opening each link manually. + Application that displays multiple archived tweets on Wayback Machine to avoid opening each link manually. The application is a prototype hosted on Streamlit Cloud, allowing users to apply filters and view tweets that lack the original URL. [Read more](https://claromes.github.io/waybacktweets/streamlit.html). 
- © Copyright 2023 - {datetime.datetime.now().year}, [Claromes](https://claromes.com) · Icon by The Doodle Library + © 2023 - {datetime.datetime.now().year}, [Claromes](https://claromes.com) · Icon by The Doodle Library · Title font by Google, licensed under the Open Font License --- """, # noqa: E501 @@ -129,24 +128,23 @@ def next_page(): def tweets_count(username, archived_timestamp_filter): url = f"https://web.archive.org/cdx/search/cdx?url=https://twitter.com/{username}/status/*&output=json&from={archived_timestamp_filter[0]}&to={archived_timestamp_filter[1]}" # noqa: E501 - try: - response = get_response(url=url) - - if response.status_code == 200: - data = response.json() - if data and len(data) > 1: - total_tweets = len(data) - 1 - return total_tweets - else: - return 0 - except requests.exceptions.ReadTimeout: - st.error("Connection to web.archive.org timed out.") + response, error, error_type = get_response(url=url) + + if response.status_code == 200: + data = response.json() + if data and len(data) > 1: + total_tweets = len(data) - 1 + return total_tweets + else: + return 0 + elif error and error_type == "ReadTimeout": + st.error("Failed to establish a new connection with web.archive.org.") st.stop() - except requests.exceptions.ConnectionError: + elif error and error_type == "ConnectionError": st.error("Failed to establish a new connection with web.archive.org.") st.stop() - except Exception as e: - st.error(f"{e}") + elif error and error_type: + st.error(f"{error}") st.stop() diff --git a/docs/conf.py b/docs/conf.py index 643113f..5692fea 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -4,7 +4,7 @@ from pallets_sphinx_themes import ProjectLink, get_version project = "Wayback Tweets" release, version = get_version("waybacktweets") -copyright = f"2023 - {datetime.datetime.now().year}, Claromes · Icon by The Doodle Library · Title Font by Google, licensed under the Open Font License · Wayback Tweets v{version}" # noqa: E501 +copyright = f"2023 - 
{datetime.datetime.now().year}, Claromes · Icon by The Doodle Library · Title font by Google, licensed under the Open Font License · Wayback Tweets v{version}" # noqa: E501 author = "Claromes" # -- General configuration --------------------------------------------------- diff --git a/docs/errors.rst b/docs/errors.rst deleted file mode 100644 index 38a8f1b..0000000 --- a/docs/errors.rst +++ /dev/null @@ -1,32 +0,0 @@ -Errors -================ - -These are the most common errors and are handled by the ``waybacktweets`` package. - -ReadTimeout ----------------- - -This error occurs when a request to the web.archive.org server takes too long to respond. The server could be overloaded or there could be network issues. - -The output message from the package would be: ``Connection to web.archive.org timed out.`` - -ConnectionError ----------------- - -This error is raised when the package fails to establish a new connection with web.archive.org. This could be due to network issues or the server being down. - -The output message from the package would be: ``Failed to establish a new connection with web.archive.org. Max retries exceeded.`` - - -This is the error often returned when performing experimental parsing of URLs with the mimetype ``application/json``. - -The warning output message from the package would be: ``Connection error with https://web.archive.org/web//https://twitter.com//status/. Max retries exceeded. Error parsing the JSON, but the CDX data was saved.`` - -HTTPError ----------------- - -This error occurs when the Internet Archive services are temporarily offline. This could be due to maintenance or server issues. - -The output message from the package would be: ``Temporarily Offline: Internet Archive services are temporarily offline. 
Please check Internet Archive Twitter feed (https://twitter.com/internetarchive) for the latest information.`` - - diff --git a/docs/exceptions.rst b/docs/exceptions.rst new file mode 100644 index 0000000..109e41b --- /dev/null +++ b/docs/exceptions.rst @@ -0,0 +1,32 @@ +Exceptions +================ + +These are the most common errors and are handled by the ``waybacktweets`` package. + +ReadTimeout +---------------- + +This error occurs when a request to the web.archive.org server takes too long to respond. The server could be overloaded or there could be network issues. + +The output message from the package would be: ``Connection to web.archive.org timed out.`` + +ConnectionError +---------------- + +This error is raised when the package fails to establish a new connection with web.archive.org. This could be due to network issues or the server being down. + +The output message from the package would be: ``Failed to establish a new connection with web.archive.org. Max retries exceeded.`` + + +This is the error often returned when performing experimental parsing of URLs with the mimetype ``application/json``. + +The warning output message from the package would be: ``Connection error with https://web.archive.org/web//https://twitter.com//status/. Max retries exceeded. Error parsing the JSON, but the CDX data was saved.`` + +HTTPError +---------------- + +This error occurs when the Internet Archive services are temporarily offline. This could be due to maintenance or server issues. + +The output message from the package would be: ``Temporarily Offline: Internet Archive services are temporarily offline. 
Please check Internet Archive Twitter feed (https://twitter.com/internetarchive) for the latest information.`` + + diff --git a/docs/index.rst b/docs/index.rst index c6e5175..9bf5bcd 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -19,7 +19,7 @@ User Guide quickstart workflow result - errors + exceptions contribute todo diff --git a/docs/quickstart.rst b/docs/quickstart.rst index f98a503..4e3c4d7 100644 --- a/docs/quickstart.rst +++ b/docs/quickstart.rst @@ -10,7 +10,7 @@ waybacktweets [OPTIONS] USERNAME .. code-block:: shell - waybacktweets --from 20150101 --to 20191231 --limit 250 jack` + waybacktweets --from 20150101 --to 20191231 --limit 250 jack Module @@ -46,4 +46,4 @@ Web App Using Wayback Tweets as a Streamlit Web App -`Access the application <https://waybacktweets.streamlit.app>`_, a prototype written in Python with the Streamlit framework and hosted on Streamlit Cloud. +`Open the application <https://waybacktweets.streamlit.app>`_, a prototype written in Python with the Streamlit framework and hosted on Streamlit Cloud. diff --git a/docs/streamlit.rst b/docs/streamlit.rst index 78da866..b8de7d9 100644 --- a/docs/streamlit.rst +++ b/docs/streamlit.rst @@ -3,6 +3,8 @@ Web App Aplication that displays multiple archived tweets on Wayback Machine to avoid opening each link manually. The application is a prototype written in Python with the Streamlit framework and hosted on Streamlit Cloud, allowing users to apply filters and view tweets that lack the original URL. +`Open the application <https://waybacktweets.streamlit.app>`_. + Filters ---------- diff --git a/docs/workflow.rst b/docs/workflow.rst index c3ffd32..2480b35 100644 --- a/docs/workflow.rst +++ b/docs/workflow.rst @@ -19,5 +19,5 @@ Use the mouse to zoom in and out the flowchart. 
C--> |4xx| E[return None] E--> F{request Archived\nTweet URL} F--> |4xx| G[return Only CDX data] - F--> |TODO: 2xx/3xx: application/json| J[return JSON text] + F--> |2xx/3xx: application/json| J[return JSON text] F--> |2xx/3xx: text/html, warc/revisit, unk| K[return HTML iframe tag] diff --git a/waybacktweets/api/export_tweets.py b/waybacktweets/api/export_tweets.py index 4cd5d83..a6daf41 100644 --- a/waybacktweets/api/export_tweets.py +++ b/waybacktweets/api/export_tweets.py @@ -1,3 +1,7 @@ +""" +Exports the parsed archived tweets. +""" + import datetime import os import re diff --git a/waybacktweets/api/parse_tweets.py b/waybacktweets/api/parse_tweets.py index 28404e8..585aec2 100644 --- a/waybacktweets/api/parse_tweets.py +++ b/waybacktweets/api/parse_tweets.py @@ -1,10 +1,13 @@ +""" +Parses the returned data from the Wayback CDX Server API. +""" + import re from concurrent.futures import ThreadPoolExecutor, as_completed from contextlib import nullcontext from typing import Any, Dict, List, Optional, Tuple from urllib.parse import unquote -from requests import exceptions from rich import print as rprint from rich.progress import Progress @@ -47,49 +50,52 @@ class TwitterEmbed: availability statuses, and URLs, respectively. If no tweets are available, returns None. """ - try: - url = f"https://publish.twitter.com/oembed?url={self.tweet_url}" - response = get_response(url=url) - - if response: - json_response = response.json() - html = json_response["html"] - author_name = json_response["author_name"] - - regex = re.compile( - r'