From e9f4367e43b1d45f03891df4b258a898a2926dce Mon Sep 17 00:00:00 2001
From: Claromes
Date: Wed, 12 Jun 2024 05:46:34 -0300
Subject: [PATCH] Review JSON requests

---
 app/{new_app.py => app.py}      | 23 ++++++++++++++++-------
 waybacktweets/cli.py            |  3 ++-
 waybacktweets/parse_tweets.py   | 29 +++++++++++------------------
 waybacktweets/request_tweets.py | 16 ++++++++++------
 waybacktweets/utils.py          | 22 ++++++++++++++++++++++
 waybacktweets/viz_tweets.py     | 17 +++++++++++++++--
 6 files changed, 76 insertions(+), 34 deletions(-)
 rename app/{new_app.py => app.py} (94%)

diff --git a/app/new_app.py b/app/app.py
similarity index 94%
rename from app/new_app.py
rename to app/app.py
index 6f3eabf..329034e 100644
--- a/app/new_app.py
+++ b/app/app.py
@@ -7,7 +7,7 @@ import streamlit.components.v1 as components
 from waybacktweets.export_tweets import TweetsExporter
 from waybacktweets.parse_tweets import TweetsParser
 from waybacktweets.request_tweets import WaybackTweets
-from waybacktweets.utils import check_double_status
+from waybacktweets.utils import check_double_status, get_response
 
 
 # Initial Settings
@@ -111,7 +111,7 @@ def tweets_count(username, archived_timestamp_filter):
     url = f"https://web.archive.org/cdx/search/cdx?url=https://twitter.com/{username}/status/*&collapse=timestamp:8&output=json&from={archived_timestamp_filter[0]}&to={archived_timestamp_filter[1]}"  # noqa: E501
 
     try:
-        response = requests.get(url)
+        response = get_response(url=url)
 
         if response.status_code == 200:
             data = response.json()
@@ -282,9 +282,18 @@ if query or st.session_state.count:
                     st.divider()
 
-                # Display tweets not available with text/html, unk, warc/revisit return  # noqa: E501
+                # Display tweets not available with text/html, unk, warc/revisit MIME type or application/json MIME type without parsed JSON text  # noqa: E501
                 elif (
-                    archived_mimetype[i] != "application/json"
+                    (
+                        archived_mimetype[i] != "application/json"
+                        and not parsed_tweet_text_mimetype_json[i]
+                    )
+                    and not available_tweet_text[i]
+                ) or (
+                    (
+                        archived_mimetype[i] == "application/json"
+                        and not parsed_tweet_text_mimetype_json[i]
+                    )
                     and not available_tweet_text[i]
                 ):
                     if (
@@ -319,11 +328,11 @@ if query or st.session_state.count:
                     st.divider()
 
-                # Display tweets not available with application/json return  # noqa: E501
+                # Display tweets not available with application/json MIME type and parsed JSON text  # noqa: E501
                 elif (
                     archived_mimetype[i] == "application/json"
-                    and not available_tweet_text[i]
-                ):
+                    and parsed_tweet_text_mimetype_json[i]
+                ) and not available_tweet_text[i]:
                     st.code(parsed_tweet_text_mimetype_json[i])
 
                     # st.json(json_data, expanded=False)
 
diff --git a/waybacktweets/cli.py b/waybacktweets/cli.py
index ebaebc8..23596ec 100644
--- a/waybacktweets/cli.py
+++ b/waybacktweets/cli.py
@@ -5,6 +5,7 @@ CLI functions for retrieving archived tweets.
 from datetime import datetime
 
 import click
+from requests import exceptions
 from rich import print as rprint
 
 from waybacktweets.export_tweets import TweetsExporter
@@ -83,7 +84,7 @@ def cli(username, unique, timestamp_from, timestamp_to, limit):
             exporter.save_to_json()
             exporter.save_to_html()
 
-    except TypeError as e:
+    except exceptions.RequestException as e:
         rprint(f"[red]{e}")
     finally:
         rprint(
diff --git a/waybacktweets/parse_tweets.py b/waybacktweets/parse_tweets.py
index 76ad899..f182d24 100644
--- a/waybacktweets/parse_tweets.py
+++ b/waybacktweets/parse_tweets.py
@@ -1,9 +1,8 @@
 import re
-import time
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from urllib.parse import unquote
 
-import requests
+from requests import exceptions
 from rich import print as rprint
 from rich.progress import Progress
 
@@ -12,6 +11,7 @@ from waybacktweets.utils import (
     check_pattern_tweet,
     clean_tweet_url,
     delete_tweet_pathnames,
+    get_response,
     semicolon_parser,
 )
 
@@ -26,7 +26,7 @@ class TwitterEmbed:
         """Parses the archived tweets when they are still available."""
         try:
             url = f"https://publish.twitter.com/oembed?url={self.tweet_url}"
-            response = requests.get(url)
+            response = get_response(url=url)
 
             if response:
                 json_response = response.json()
@@ -62,7 +62,7 @@ class TwitterEmbed:
                     is_RT.append(author_name != author_tweet)
 
             return tweet_content, is_RT, user_info
-        except Exception:
+        except exceptions.RequestException:
             rprint("[yellow]Error parsing the tweet, but the CDX data was saved.")
 
             return None
@@ -75,18 +75,8 @@ class JsonParser:
 
     def parse(self):
         """Parses the archived tweets in JSON format."""
-
-        max_attempts = 5
         try:
-            for attempt in range(max_attempts):
-                try:
-                    response = requests.get(self.archived_tweet_url)
-                    break
-                except requests.exceptions.ConnectionError:
-                    if attempt < max_attempts - 1:
-                        time.sleep(0.5)
-                    else:
-                        raise
+            response = get_response(url=self.archived_tweet_url)
 
             if response:
                 json_data = response.json()
@@ -100,10 +90,13 @@ class JsonParser:
                 )
 
             return json_data.get("text", json_data)
-        except Exception:
+        except exceptions.ConnectionError:
             rprint(
-                f"[yellow]Connection error with {self.archived_tweet_url}. Error parsing the JSON, but the CDX data was saved."  # noqa: E501
+                f"[yellow]Connection error with {self.archived_tweet_url}. Max retries exceeded. Error parsing the JSON, but the CDX data was saved."  # noqa: E501
             )
+            return ""
+        except exceptions.RequestException:
+            rprint("[yellow]Error parsing the JSON, but the CDX data was saved.")
 
             return ""
 
@@ -199,7 +192,7 @@ class TweetsParser:
                 try:
                     future.result()
                 except Exception as e:
-                    rprint(f"[red]{e}")
+                    rprint(f"[red]{e}...")
 
                 progress.update(task, advance=1)
 
diff --git a/waybacktweets/request_tweets.py b/waybacktweets/request_tweets.py
index 0093629..b6c561e 100644
--- a/waybacktweets/request_tweets.py
+++ b/waybacktweets/request_tweets.py
@@ -1,6 +1,8 @@
-import requests
+from requests import exceptions
 from rich import print as rprint
 
+from waybacktweets.utils import get_response
+
 
 class WaybackTweets:
     """Requests data from the Wayback CDX Server API and returns it in JSON format."""
@@ -35,15 +37,17 @@ class WaybackTweets:
         print("Making a request to the Internet Archive...")
 
         try:
-            response = requests.get(url, params=params)
+            response = get_response(url=url, params=params)
 
             if response:
                 return response.json()
-        except requests.exceptions.ReadTimeout:
+        except exceptions.ReadTimeout:
             rprint("[red]Connection to web.archive.org timed out.")
-        except requests.exceptions.ConnectionError:
-            rprint("[red]Failed to establish a new connection with web.archive.org.")
-        except requests.exceptions.HTTPError:
+        except exceptions.ConnectionError:
+            rprint(
+                "[red]Failed to establish a new connection with web.archive.org. Max retries exceeded."  # noqa: E501
+            )
+        except exceptions.HTTPError:
             rprint(
                 "[red]Temporarily Offline: Internet Archive services are temporarily offline. Please check Internet Archive Twitter feed (https://twitter.com/internetarchive) for the latest information."  # noqa: E501
             )
diff --git a/waybacktweets/utils.py b/waybacktweets/utils.py
index 822a5dd..65c74a2 100644
--- a/waybacktweets/utils.py
+++ b/waybacktweets/utils.py
@@ -4,6 +4,28 @@ Helper functions.
 
 import re
 
+import requests
+from requests.adapters import HTTPAdapter
+from urllib3.util.retry import Retry
+
+
+def get_response(url, params=None):
+    """Sends a GET request to the specified URL and returns the response."""
+    session = requests.Session()
+    retry = Retry(connect=3, backoff_factor=0.3)
+    adapter = HTTPAdapter(max_retries=retry)
+    headers = {
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36"  # noqa: E501
+    }
+
+    session.mount("http://", adapter)
+    session.mount("https://", adapter)
+
+    response = session.get(url, params=params, headers=headers)
+
+    if not 400 <= response.status_code <= 511:
+        return response
+
 
 def clean_tweet_url(tweet_url, username):
     """
diff --git a/waybacktweets/viz_tweets.py b/waybacktweets/viz_tweets.py
index 1e803fa..5434980 100644
--- a/waybacktweets/viz_tweets.py
+++ b/waybacktweets/viz_tweets.py
@@ -18,6 +18,7 @@ class HTMLTweetsVisualizer:
 
     def generate(self):
         """Generates an HTML file."""
+
        html = f"<!DOCTYPE html>\n<html>\n<head><title>@{self.username} archived tweets</title>\n"
        html += "
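
A minimal usage sketch of the new get_response() helper defined in the
waybacktweets/utils.py hunk above (not part of the patch). The CDX endpoint
and the username "jack" in the params are illustrative placeholders; any
url/params pair accepted by requests works the same way:

    from requests import exceptions

    from waybacktweets.utils import get_response

    url = "https://web.archive.org/cdx/search/cdx"
    params = {"url": "https://twitter.com/jack/status/*", "output": "json"}

    try:
        response = get_response(url=url, params=params)

        # get_response() implicitly returns None on 4xx/5xx status codes,
        # so callers must truthiness-check before reading the body.
        if response:
            print(response.json())
    except exceptions.ConnectionError:
        # Raised once the Retry(connect=3) budget inside get_response()
        # is exhausted, matching the "Max retries exceeded" messages above.
        print("Failed to reach web.archive.org after retries.")

Design note: since the helper swallows HTTP error statuses rather than
calling response.raise_for_status(), the except exceptions.HTTPError branch
in request_tweets.py only fires if an HTTPError is raised elsewhere.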