From e9f4367e43b1d45f03891df4b258a898a2926dce Mon Sep 17 00:00:00 2001
From: Claromes
Date: Wed, 12 Jun 2024 05:46:34 -0300
Subject: [PATCH] Review JSON requests

---
 app/{new_app.py => app.py}      | 23 ++++++++++++++++-------
 waybacktweets/cli.py            |  3 ++-
 waybacktweets/parse_tweets.py   | 29 +++++++++++------------------
 waybacktweets/request_tweets.py | 16 ++++++++++------
 waybacktweets/utils.py          | 22 ++++++++++++++++++++++
 waybacktweets/viz_tweets.py     | 17 +++++++++++++++--
 6 files changed, 76 insertions(+), 34 deletions(-)
 rename app/{new_app.py => app.py} (94%)

diff --git a/app/new_app.py b/app/app.py
similarity index 94%
rename from app/new_app.py
rename to app/app.py
index 6f3eabf..329034e 100644
--- a/app/new_app.py
+++ b/app/app.py
@@ -7,7 +7,7 @@ import streamlit.components.v1 as components
 from waybacktweets.export_tweets import TweetsExporter
 from waybacktweets.parse_tweets import TweetsParser
 from waybacktweets.request_tweets import WaybackTweets
-from waybacktweets.utils import check_double_status
+from waybacktweets.utils import check_double_status, get_response
 
 
 # Initial Settings
@@ -111,7 +111,7 @@ def tweets_count(username, archived_timestamp_filter):
     url = f"https://web.archive.org/cdx/search/cdx?url=https://twitter.com/{username}/status/*&collapse=timestamp:8&output=json&from={archived_timestamp_filter[0]}&to={archived_timestamp_filter[1]}"  # noqa: E501
 
     try:
-        response = requests.get(url)
+        response = get_response(url=url)
 
         if response.status_code == 200:
             data = response.json()
@@ -282,9 +282,18 @@ if query or st.session_state.count:
                     st.divider()
 
-                # Display tweets not available with text/html, unk, warc/revisit return  # noqa: E501
+                # Display tweets not available with text/html, unk, warc/revisit MIME type or application/json MIME type without parsed JSON text  # noqa: E501
                 elif (
-                    archived_mimetype[i] != "application/json"
+                    (
+                        archived_mimetype[i] != "application/json"
+                        and not parsed_tweet_text_mimetype_json[i]
+                    )
+                    and not available_tweet_text[i]
+                ) or (
+                    (
+                        archived_mimetype[i] == "application/json"
+                        and not parsed_tweet_text_mimetype_json[i]
+                    )
                     and not available_tweet_text[i]
                 ):
                     if (
@@ -319,11 +328,11 @@ if query or st.session_state.count:
                     st.divider()
 
-                # Display tweets not available with application/json return  # noqa: E501
+                # Display tweets not available with application/json MIME type and parsed JSON text  # noqa: E501
                 elif (
                     archived_mimetype[i] == "application/json"
-                    and not available_tweet_text[i]
-                ):
+                    and parsed_tweet_text_mimetype_json[i]
+                ) and not available_tweet_text[i]:
                     st.code(parsed_tweet_text_mimetype_json[i])
 
                     # st.json(json_data, expanded=False)
 
diff --git a/waybacktweets/cli.py b/waybacktweets/cli.py
index ebaebc8..23596ec 100644
--- a/waybacktweets/cli.py
+++ b/waybacktweets/cli.py
@@ -5,6 +5,7 @@ CLI functions for retrieving archived tweets.
 from datetime import datetime
 
 import click
+from requests import exceptions
 from rich import print as rprint
 
 from waybacktweets.export_tweets import TweetsExporter
@@ -83,7 +84,7 @@ def cli(username, unique, timestamp_from, timestamp_to, limit):
             exporter.save_to_json()
             exporter.save_to_html()
 
-    except TypeError as e:
+    except exceptions.RequestException as e:
         rprint(f"[red]{e}")
     finally:
         rprint(
diff --git a/waybacktweets/parse_tweets.py b/waybacktweets/parse_tweets.py
index 76ad899..f182d24 100644
--- a/waybacktweets/parse_tweets.py
+++ b/waybacktweets/parse_tweets.py
@@ -1,9 +1,8 @@
 import re
-import time
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from urllib.parse import unquote
 
-import requests
+from requests import exceptions
 from rich import print as rprint
 from rich.progress import Progress
 
@@ -12,6 +11,7 @@ from waybacktweets.utils import (
     check_pattern_tweet,
     clean_tweet_url,
     delete_tweet_pathnames,
+    get_response,
     semicolon_parser,
 )
 
@@ -26,7 +26,7 @@ class TwitterEmbed:
         """Parses the archived tweets when they are still available."""
         try:
             url = f"https://publish.twitter.com/oembed?url={self.tweet_url}"
-            response = requests.get(url)
+            response = get_response(url=url)
 
             if response:
                 json_response = response.json()
@@ -62,7 +62,7 @@ class TwitterEmbed:
                     is_RT.append(author_name != author_tweet)
 
             return tweet_content, is_RT, user_info
-        except Exception:
+        except exceptions.RequestException:
             rprint("[yellow]Error parsing the tweet, but the CDX data was saved.")
 
             return None
@@ -75,18 +75,8 @@ class JsonParser:
 
     def parse(self):
         """Parses the archived tweets in JSON format."""
-
-        max_attempts = 5
         try:
-            for attempt in range(max_attempts):
-                try:
-                    response = requests.get(self.archived_tweet_url)
-                    break
-                except requests.exceptions.ConnectionError:
-                    if attempt < max_attempts - 1:
-                        time.sleep(0.5)
-                    else:
-                        raise
+            response = get_response(url=self.archived_tweet_url)
 
             if response:
                 json_data = response.json()
@@ -100,10 +90,13 @@ class JsonParser:
                 )
 
             return json_data.get("text", json_data)
-        except Exception:
+        except exceptions.ConnectionError:
             rprint(
-                f"[yellow]Connection error with {self.archived_tweet_url}. Error parsing the JSON, but the CDX data was saved."  # noqa: E501
+                f"[yellow]Connection error with {self.archived_tweet_url}. Max retries exceeded. Error parsing the JSON, but the CDX data was saved."  # noqa: E501
             )
+            return ""
+        except exceptions.RequestException:
+            rprint("[yellow]Error parsing the JSON, but the CDX data was saved.")
 
             return ""
 
@@ -199,7 +192,7 @@ class TweetsParser:
                 try:
                     future.result()
                 except Exception as e:
-                    rprint(f"[red]{e}")
+                    rprint(f"[red]{e}...")
 
                 progress.update(task, advance=1)
 
diff --git a/waybacktweets/request_tweets.py b/waybacktweets/request_tweets.py
index 0093629..b6c561e 100644
--- a/waybacktweets/request_tweets.py
+++ b/waybacktweets/request_tweets.py
@@ -1,6 +1,8 @@
-import requests
+from requests import exceptions
 from rich import print as rprint
 
+from waybacktweets.utils import get_response
+
 
 class WaybackTweets:
     """Requests data from the Wayback CDX Server API and returns it in JSON format."""
@@ -35,15 +37,17 @@ class WaybackTweets:
         print("Making a request to the Internet Archive...")
 
         try:
-            response = requests.get(url, params=params)
+            response = get_response(url=url, params=params)
 
             if response:
                 return response.json()
-        except requests.exceptions.ReadTimeout:
+        except exceptions.ReadTimeout:
             rprint("[red]Connection to web.archive.org timed out.")
-        except requests.exceptions.ConnectionError:
-            rprint("[red]Failed to establish a new connection with web.archive.org.")
-        except requests.exceptions.HTTPError:
+        except exceptions.ConnectionError:
+            rprint(
+                "[red]Failed to establish a new connection with web.archive.org. Max retries exceeded."  # noqa: E501
+            )
+        except exceptions.HTTPError:
             rprint(
                 "[red]Temporarily Offline: Internet Archive services are temporarily offline. Please check Internet Archive Twitter feed (https://twitter.com/internetarchive) for the latest information."  # noqa: E501
             )
diff --git a/waybacktweets/utils.py b/waybacktweets/utils.py
index 822a5dd..65c74a2 100644
--- a/waybacktweets/utils.py
+++ b/waybacktweets/utils.py
@@ -4,6 +4,28 @@ Helper functions.
 
 import re
 
+import requests
+from requests.adapters import HTTPAdapter
+from urllib3.util.retry import Retry
+
+
+def get_response(url, params=None):
+    """Sends a GET request to the specified URL and returns the response."""
+    session = requests.Session()
+    retry = Retry(connect=3, backoff_factor=0.3)
+    adapter = HTTPAdapter(max_retries=retry)
+    headers = {
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36"  # noqa: E501
+    }
+
+    session.mount("http://", adapter)
+    session.mount("https://", adapter)
+
+    response = session.get(url, params=params, headers=headers)
+
+    if not 400 <= response.status_code <= 511:
+        return response
+
 
 def clean_tweet_url(tweet_url, username):
     """
diff --git a/waybacktweets/viz_tweets.py b/waybacktweets/viz_tweets.py
index 1e803fa..5434980 100644
--- a/waybacktweets/viz_tweets.py
+++ b/waybacktweets/viz_tweets.py
@@ -18,6 +18,7 @@ class HTMLTweetsVisualizer:
 
     def generate(self):
         """Generates an HTML file."""
+
        html = f"<!DOCTYPE html>\n<html>\n<head><title>@{self.username} archived tweets</title>\n"
        html += "
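
A minimal usage sketch of the new get_response() helper defined in the
waybacktweets/utils.py hunk above (not part of the patch). The CDX endpoint
and the username "jack" in the params are illustrative placeholders; any
url/params pair accepted by requests works the same way:

    from requests import exceptions

    from waybacktweets.utils import get_response

    url = "https://web.archive.org/cdx/search/cdx"
    params = {"url": "https://twitter.com/jack/status/*", "output": "json"}

    try:
        response = get_response(url=url, params=params)

        # get_response() implicitly returns None on 4xx/5xx status codes,
        # so callers must truthiness-check before reading the body.
        if response:
            print(response.json())
    except exceptions.ConnectionError:
        # Raised once the Retry(connect=3) budget inside get_response()
        # is exhausted, matching the "Max retries exceeded" messages above.
        print("Failed to reach web.archive.org after retries.")

Design note: since the helper swallows HTTP error statuses rather than
calling response.raise_for_status(), the except exceptions.HTTPError branch
in request_tweets.py only fires if an HTTPError is raised elsewhere.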