Review JSON requests
authorClaromes <claromes@hey.com>
Wed, 12 Jun 2024 08:46:34 +0000 (05:46 -0300)
committerClaromes <claromes@hey.com>
Wed, 12 Jun 2024 08:46:34 +0000 (05:46 -0300)
app/app.py [new file with mode: 0644]
app/new_app.py [deleted file]
waybacktweets/cli.py
waybacktweets/parse_tweets.py
waybacktweets/request_tweets.py
waybacktweets/utils.py
waybacktweets/viz_tweets.py

diff --git a/app/app.py b/app/app.py
new file mode 100644 (file)
index 0000000..329034e
--- /dev/null
@@ -0,0 +1,436 @@
+import datetime
+
+import requests
+import streamlit as st
+import streamlit.components.v1 as components
+
+from waybacktweets.export_tweets import TweetsExporter
+from waybacktweets.parse_tweets import TweetsParser
+from waybacktweets.request_tweets import WaybackTweets
+from waybacktweets.utils import check_double_status, get_response
+
+# Initial Settings
+
+LOGO = "app/assets/parthenon.svg"
+
+st.set_page_config(
+    menu_items={
+        "Report a bug": "https://github.com/claromes/waybacktweets/issues",
+    },
+    layout="centered",
+    page_icon=LOGO,
+    page_title="Wayback Tweets",
+)
+
+# Page-wide CSS overrides: header opacity, iframe border, hidden input
+# instructions and logo scaling.
+# https://discuss.streamlit.io/t/remove-hide-running-man-animation-on-top-of-page/21773/3
+st.html(
+    """
+<style>
+    header[data-testid="stHeader"] {
+        opacity: 0.5;
+    }
+     iframe {
+        border: 1px solid #dddddd;
+        border-radius: 0.5rem;
+    }
+    div[data-testid="InputInstructions"] {
+        visibility: hidden;
+    }
+    img[data-testid="stLogo"] {
+        scale: 3;
+        padding-left: 10px;
+    }
+</style>
+"""
+)
+
+# Default bounds of the archived-timestamp filter.
+start_date = datetime.datetime(2006, 3, 1)
+end_date = datetime.datetime.now()
+
+# Session-state keys and the value each one gets the first time the script
+# runs in a session; keys that already exist are left untouched.
+_session_defaults = {
+    "current_username": "",
+    "prev_disabled": False,
+    "next_disabled": False,
+    "next_button": False,
+    "prev_button": False,
+    "update_component": 0,
+    "offset": 0,
+    "count": False,
+    "archived_timestamp_filter": (start_date, end_date),
+}
+
+for _key, _default in _session_defaults.items():
+    if _key not in st.session_state:
+        st.session_state[_key] = _default
+
+
+# Pagination Settings
+
+
+def scroll_into_view():
+    """Render a zero-size HTML component that scrolls the main section to the top.
+
+    The current ``st.session_state.update_component`` counter is interpolated
+    into the script, so the emitted markup changes whenever the counter does.
+    """
+    components.html(
+        f"""
+    <script>
+        window.parent.document.querySelector('section.main').scrollTo(0, 0);
+        let update_component = {st.session_state.update_component}
+    </script>
+    """,
+        width=0,
+        height=0,
+    )
+
+
+def prev_page():
+    """Move the pagination offset back by one page and scroll to the top.
+
+    Used as the ``on_click`` callback of the "Previous" button. Reads the
+    module-level ``tweets_per_page`` and updates ``st.session_state.offset``
+    and ``st.session_state.update_component``.
+    """
+    # Clamp at zero: the button is not reliably disabled on the first page,
+    # and a negative offset would produce negative item numbering.
+    st.session_state.offset = max(0, st.session_state.offset - tweets_per_page)
+
+    st.session_state.update_component += 1
+    scroll_into_view()
+
+
+def next_page():
+    """Advance the pagination offset by one page and scroll to the top.
+
+    Used as the ``on_click`` callback of the "Next" button. Reads the
+    module-level ``tweets_per_page``.
+    """
+    state = st.session_state
+    state.offset = state.offset + tweets_per_page
+    state.update_component = state.update_component + 1
+
+    scroll_into_view()
+
+
+# Requesting
+
+
+@st.cache_data(ttl=1800, show_spinner=False)
+def tweets_count(username, archived_timestamp_filter):
+    """Return how many captures the CDX API lists for ``username``.
+
+    Args:
+        username: Account name interpolated into the CDX ``url`` parameter.
+        archived_timestamp_filter: Indexable pair; item 0 is sent as ``from``
+            and item 1 as ``to``.
+
+    Returns:
+        The number of rows in the JSON response minus one, or 0 when the
+        response is missing, is not HTTP 200, or has at most one row. An int
+        is always returned so callers can format and compare the value.
+
+    On a timeout or connection error an error message is shown and the
+    script run is stopped via ``st.stop()``.
+    """
+    url = f"https://web.archive.org/cdx/search/cdx?url=https://twitter.com/{username}/status/*&collapse=timestamp:8&output=json&from={archived_timestamp_filter[0]}&to={archived_timestamp_filter[1]}"  # noqa: E501
+
+    try:
+        response = get_response(url=url)
+
+        if response is not None and response.status_code == 200:
+            data = response.json()
+            if data and len(data) > 1:
+                # One row is not counted (presumably the CDX header row).
+                return len(data) - 1
+    except requests.exceptions.Timeout:
+        st.error("Connection to web.archive.org timed out.")
+        st.stop()
+    except requests.exceptions.ConnectionError:
+        st.error("Failed to establish a new connection with web.archive.org.")
+        st.stop()
+    except UnboundLocalError:
+        st.empty()
+
+    # Non-200 response, empty payload, or a swallowed error: report zero
+    # instead of falling through with an implicit None.
+    return 0
+
+
+# Interface Settings
+
+
+# Static page chrome (logo, announcement, title, badges) followed by the
+# query form widgets. Widgets are rendered in the order they are called.
+st.logo(LOGO)
+
+st.success(
+    """**New Feature: CLI**
+
+You can now retrieve archived tweets using the Wayback Tweets command line tool.
+Download the archived tweets' CDX data in CSV, JSON, and HTML formats.
+
+For more details, [read the documentation](https://github.com/claromes/waybacktweets)."""  # noqa: E501
+)
+
+st.title(
+    "Wayback Tweets",
+    anchor=False,
+)
+st.caption(
+    "[![GitHub release (latest by date including pre-releases)](https://img.shields.io/github/v/release/claromes/waybacktweets?include_prereleases)](https://github.com/claromes/waybacktweets/releases) [![Star](https://img.shields.io/github/stars/claromes/waybacktweets?style=social)](https://github.com/claromes/waybacktweets)"  # noqa: E501
+)
+st.caption("Display multiple archived tweets on Wayback Machine")
+
+# Query form inputs.
+username = st.text_input("Username", placeholder="Without @")
+
+# Recomputed here even though the same two names are assigned earlier in the
+# file; end_date is therefore the time of this statement.
+start_date = datetime.datetime(2006, 3, 1)
+end_date = datetime.datetime.now()
+
+# The widget's return value overwrites the session-state filter on every run.
+# The dates are passed as the default range and again as the next two
+# positional arguments (presumably min/max bounds - confirm against the
+# st.date_input signature).
+st.session_state.archived_timestamp_filter = st.date_input(
+    "Tweets saved between",
+    (start_date, end_date),
+    start_date,
+    end_date,
+    format="YYYY/MM/DD",
+    help="YYYY/MM/DD",
+)
+
+not_available = st.checkbox("Only tweets not available")
+
+unique = st.checkbox(
+    "Only unique URLs",
+    help="Filtering by the collapse option using the urlkey field",
+)
+
+# True only on the run triggered by clicking the button.
+query = st.button("Query", type="primary", use_container_width=True)
+
+# Tweet Listing Settings
+
+
+# Reset pagination whenever the username typed in the form changes.
+if username != st.session_state.current_username:
+    st.session_state.current_username = username
+    st.session_state.offset = 0
+
+# Runs when the Query button was clicked, or on any later rerun while a
+# truthy count is still stored in session state (e.g. pagination clicks).
+if query or st.session_state.count:
+    tweets_per_page = 25
+
+    st.session_state.count = tweets_count(
+        username, st.session_state.archived_timestamp_filter
+    )
+
+    st.caption(
+        "The search optimization uses an 8-digit [collapsing strategy](https://github.com/internetarchive/wayback/blob/master/wayback-cdx-server/README.md?ref=hackernoon.com#collapsing), refining the captures to one per day. The number of tweets per page is set to 25, and this is a fixed value due to the API rate limit."  # noqa: E501
+    )
+    st.write(f"**{st.session_state.count} URLs have been captured**")
+
+    # Shrink the page size when fewer captures exist than one full page.
+    if st.session_state.count:
+        if tweets_per_page > st.session_state.count:
+            tweets_per_page = st.session_state.count
+
+    try:
+        # Placeholder later filled with the "not available" running total.
+        progress = st.empty()
+
+        # Tweet Listing Processing
+
+        response = WaybackTweets(
+            username,
+            unique,
+            st.session_state.archived_timestamp_filter[0],
+            st.session_state.archived_timestamp_filter[1],
+            tweets_per_page,
+        )
+        archived_tweets = response.get()
+
+        with st.spinner("Parsing..."):
+            if archived_tweets:
+                field_options = [
+                    "archived_urlkey",
+                    "archived_timestamp",
+                    "original_tweet_url",
+                    "archived_tweet_url",
+                    "parsed_tweet_url",
+                    "parsed_archived_tweet_url",
+                    "parsed_tweet_text_mimetype_json",
+                    "available_tweet_text",
+                    "available_tweet_is_RT",
+                    "available_tweet_info",
+                    "archived_mimetype",
+                    "archived_statuscode",
+                ]
+
+                parser = TweetsParser(archived_tweets, username, field_options)
+                parsed_tweets = parser.parse()
+
+                exporter = TweetsExporter(parsed_tweets, username, field_options)
+                df = exporter.dataframe
+
+                # file_path = "claromes_tweets_20240610210338.csv"
+                # df = pd.read_csv(file_path)
+                # df = df.fillna("")
+
+                # One column object per field. Some of these names
+                # (archived_urlkey, parsed_tweet_url,
+                # parsed_archived_tweet_url) are not referenced again in
+                # this block.
+                archived_urlkey = df["archived_urlkey"]
+                archived_timestamp = df["archived_timestamp"]
+                original_tweet_url = df["original_tweet_url"]
+                archived_tweet_url = df["archived_tweet_url"]
+                parsed_tweet_url = df["parsed_tweet_url"]
+                parsed_archived_tweet_url = df["parsed_archived_tweet_url"]
+                parsed_tweet_text_mimetype_json = df["parsed_tweet_text_mimetype_json"]
+                available_tweet_text = df["available_tweet_text"]
+                available_tweet_is_RT = df["available_tweet_is_RT"]
+                available_tweet_info = df["available_tweet_info"]
+                archived_mimetype = df["archived_mimetype"]
+                archived_statuscode = df["archived_statuscode"]
+
+                st.divider()
+
+                st.session_state.current_username = username
+
+                # Number of rows shown while the "not available" filter is on.
+                return_none_count = 0
+
+                start_index = st.session_state.offset
+                end_index = min(st.session_state.count, start_index + tweets_per_page)
+
+                # Rows are always read at positions 0..tweets_per_page-1; the
+                # offset only shifts the displayed item number.
+                # NOTE(review): if these columns are pandas Series with a
+                # default integer index, an out-of-range [i] raises KeyError
+                # rather than IndexError - confirm against
+                # TweetsExporter.dataframe.
+                for i in range(tweets_per_page):
+                    try:
+                        if original_tweet_url[i]:
+
+                            # Display all tweets
+                            if not not_available:
+                                st.markdown(
+                                    f'{i+1 + st.session_state.offset}. [**archived url ↗**]({archived_tweet_url[i]}) · [**tweet url ↗**]({original_tweet_url[i]}) · **MIME type:** {archived_mimetype[i]} · **archived timestamp:** {datetime.datetime.strptime(str(archived_timestamp[i]), "%Y%m%d%H%M%S")} · **archived status code:** {archived_statuscode[i]}'  # noqa: E501
+                                )
+
+                                # Display available tweets
+                                if available_tweet_text[i]:
+                                    if available_tweet_is_RT[i]:
+                                        st.info("*Retweet*")
+
+                                    st.write(available_tweet_text[i])
+                                    st.write(f"**{available_tweet_info[i]}**")
+
+                                    st.divider()
+
+                                # Display tweets not available with text/html, unk, warc/revisit MIME type or application/json MIME type without parsed JSON text # noqa: E501
+                                # Both alternatives below require no parsed
+                                # JSON text and no available text; they
+                                # differ only in the MIME-type comparison.
+                                elif (
+                                    (
+                                        archived_mimetype[i] != "application/json"
+                                        and not parsed_tweet_text_mimetype_json[i]
+                                    )
+                                    and not available_tweet_text[i]
+                                ) or (
+                                    (
+                                        archived_mimetype[i] == "application/json"
+                                        and not parsed_tweet_text_mimetype_json[i]
+                                    )
+                                    and not available_tweet_text[i]
+                                ):
+                                    if (
+                                        ".jpg" in original_tweet_url[i]
+                                        or ".png" in original_tweet_url[i]
+                                    ) and (400 <= archived_statuscode[i] <= 511):
+                                        components.iframe(
+                                            archived_tweet_url[i],
+                                            height=500,
+                                            scrolling=True,
+                                        )
+                                    elif "/status/" not in original_tweet_url[i]:
+                                        st.info(
+                                            "This isn't a status or is not available"
+                                        )
+                                    elif (
+                                        check_double_status(
+                                            archived_tweet_url[i], original_tweet_url[i]
+                                        )
+                                        or f"{st.session_state.current_username}"
+                                        not in original_tweet_url[i]
+                                    ):
+                                        st.info(
+                                            f"Replying to {st.session_state.current_username}"  # noqa: E501
+                                        )
+                                    else:
+                                        components.iframe(
+                                            archived_tweet_url[i],
+                                            height=500,
+                                            scrolling=True,
+                                        )
+
+                                    st.divider()
+
+                                # Display tweets not available with application/json MIME type and parsed JSON text # noqa: E501
+                                elif (
+                                    archived_mimetype[i] == "application/json"
+                                    and parsed_tweet_text_mimetype_json[i]
+                                ) and not available_tweet_text[i]:
+                                    st.code(parsed_tweet_text_mimetype_json[i])
+                                    # st.json(json_data, expanded=False)
+
+                                    st.divider()
+
+                            # Display only tweets not available
+                            # NOTE(review): the counter and the header line
+                            # below run for every row, including rows that
+                            # have available text - confirm this is intended.
+                            if not_available:
+                                return_none_count += 1
+
+                                st.markdown(
+                                    f'{i+1 + st.session_state.offset}. [**archived url ↗**]({archived_tweet_url[i]}) · [**tweet url ↗**]({original_tweet_url[i]}) · **MIME type:** {archived_mimetype[i]} · **archived timestamp:** {datetime.datetime.strptime(str(archived_timestamp[i]), "%Y%m%d%H%M%S")} · **archived status code:** {archived_statuscode[i]}'  # noqa: E501
+                                )
+
+                                # Display tweets not available with text/html, unk, warc/revisit return # noqa: E501
+                                if (
+                                    archived_mimetype[i] != "application/json"
+                                    and not available_tweet_text[i]
+                                ):
+                                    if (
+                                        ".jpg" in original_tweet_url[i]
+                                        or ".png" in original_tweet_url[i]
+                                    ) and (400 <= archived_statuscode[i] <= 511):
+                                        components.iframe(
+                                            archived_tweet_url[i],
+                                            height=500,
+                                            scrolling=True,
+                                        )
+                                    elif "/status/" not in original_tweet_url[i]:
+                                        st.info(
+                                            "This isn't a status or is not available"
+                                        )
+                                    elif (
+                                        check_double_status(
+                                            archived_tweet_url[i], original_tweet_url[i]
+                                        )
+                                        or f"{st.session_state.current_username}"
+                                        not in original_tweet_url[i]
+                                    ):
+                                        st.info(
+                                            f"Replying to {st.session_state.current_username}"  # noqa: E501
+                                        )
+                                    else:
+                                        components.iframe(
+                                            archived_tweet_url[i],
+                                            height=500,
+                                            scrolling=True,
+                                        )
+
+                                    st.divider()
+
+                                # Display tweets not available with application/json return # noqa: E501
+                                elif (
+                                    archived_mimetype[i] == "application/json"
+                                    and not available_tweet_text[i]
+                                ):
+                                    st.code(parsed_tweet_text_mimetype_json[i])
+                                    # st.json(json_data, expanded=False)
+
+                                    st.divider()
+
+                                progress.write(
+                                    f"{return_none_count} URLs have been captured in the range {start_index}-{end_index}"  # noqa: E501
+                                )
+                        pass
+                    except IndexError:
+                        # Ran past the last row: disable "Next", and disable
+                        # "Previous" only when already on the first page.
+                        if start_index <= 0:
+                            st.session_state.prev_disabled = True
+                        else:
+                            st.session_state.prev_disabled = False
+
+                        st.session_state.next_disabled = True
+
+            # Pagination buttons; the middle column is a spacer.
+            # Note: the name `next` shadows the builtin of the same name.
+            prev, _, next = st.columns([3, 4, 3])
+
+            prev.button(
+                "Previous",
+                disabled=st.session_state.prev_disabled,
+                key="prev_button_key",
+                on_click=prev_page,
+                type="primary",
+                use_container_width=True,
+            )
+            next.button(
+                "Next",
+                disabled=st.session_state.next_disabled,
+                key="next_button_key",
+                on_click=next_page,
+                type="primary",
+                use_container_width=True,
+            )
+
+        if not archived_tweets:
+            st.error("Unable to query the Wayback Machine API.")
+    except TypeError as e:
+        # Only TypeError is handled here; the message is shown to the user
+        # and pagination is reset.
+        st.error(
+            f"""
+        {e}. Refresh this page and try again.
+
+        If the problem persists [open an issue](https://github.com/claromes/waybacktweets/issues)."""  # noqa: E501
+        )
+        st.session_state.offset = 0
diff --git a/app/new_app.py b/app/new_app.py
deleted file mode 100644 (file)
index 6f3eabf..0000000
+++ /dev/null
@@ -1,427 +0,0 @@
-import datetime
-
-import requests
-import streamlit as st
-import streamlit.components.v1 as components
-
-from waybacktweets.export_tweets import TweetsExporter
-from waybacktweets.parse_tweets import TweetsParser
-from waybacktweets.request_tweets import WaybackTweets
-from waybacktweets.utils import check_double_status
-
-# Initial Settings
-
-LOGO = "app/assets/parthenon.svg"
-
-st.set_page_config(
-    page_title="Wayback Tweets",
-    page_icon=LOGO,
-    layout="centered",
-    menu_items={
-        "Report a bug": "https://github.com/claromes/waybacktweets/issues",
-    },
-)
-
-# https://discuss.streamlit.io/t/remove-hide-running-man-animation-on-top-of-page/21773/3
-st.html(
-    """
-<style>
-    header[data-testid="stHeader"] {
-        opacity: 0.5;
-    }
-     iframe {
-        border: 1px solid #dddddd;
-        border-radius: 0.5rem;
-    }
-    div[data-testid="InputInstructions"] {
-        visibility: hidden;
-    }
-    img[data-testid="stLogo"] {
-        scale: 3;
-        padding-left: 10px;
-    }
-</style>
-"""
-)
-
-if "current_username" not in st.session_state:
-    st.session_state.current_username = ""
-
-if "prev_disabled" not in st.session_state:
-    st.session_state.prev_disabled = False
-
-if "next_disabled" not in st.session_state:
-    st.session_state.next_disabled = False
-
-if "next_button" not in st.session_state:
-    st.session_state.next_button = False
-
-if "prev_button" not in st.session_state:
-    st.session_state.prev_button = False
-
-if "update_component" not in st.session_state:
-    st.session_state.update_component = 0
-
-if "offset" not in st.session_state:
-    st.session_state.offset = 0
-
-if "count" not in st.session_state:
-    st.session_state.count = False
-
-start_date = datetime.datetime(2006, 3, 1)
-end_date = datetime.datetime.now()
-
-if "archived_timestamp_filter" not in st.session_state:
-    st.session_state.archived_timestamp_filter = (start_date, end_date)
-
-
-# Pagination Settings
-
-
-def scroll_into_view():
-    script = f"""
-    <script>
-        window.parent.document.querySelector('section.main').scrollTo(0, 0);
-        let update_component = {st.session_state.update_component}
-    </script>
-    """
-
-    components.html(script, width=0, height=0)
-
-
-def prev_page():
-    st.session_state.offset -= tweets_per_page
-
-    st.session_state.update_component += 1
-    scroll_into_view()
-
-
-def next_page():
-    st.session_state.offset += tweets_per_page
-
-    st.session_state.update_component += 1
-    scroll_into_view()
-
-
-# Requesting
-
-
-@st.cache_data(ttl=1800, show_spinner=False)
-def tweets_count(username, archived_timestamp_filter):
-    url = f"https://web.archive.org/cdx/search/cdx?url=https://twitter.com/{username}/status/*&collapse=timestamp:8&output=json&from={archived_timestamp_filter[0]}&to={archived_timestamp_filter[1]}"  # noqa: E501
-
-    try:
-        response = requests.get(url)
-
-        if response.status_code == 200:
-            data = response.json()
-            if data and len(data) > 1:
-                total_tweets = len(data) - 1
-                return total_tweets
-            else:
-                return 0
-    except requests.exceptions.Timeout:
-        st.error("Connection to web.archive.org timed out.")
-        st.stop()
-    except requests.exceptions.ConnectionError:
-        st.error("Failed to establish a new connection with web.archive.org.")
-        st.stop()
-    except UnboundLocalError:
-        st.empty()
-
-
-# Interface Settings
-
-
-st.logo(LOGO)
-
-st.success(
-    """**New Feature: CLI**
-
-You can now retrieve archived tweets using the Wayback Tweets command line tool.
-Download the archived tweets' CDX data in CSV, JSON, and HTML formats.
-
-For more details, [read the documentation](https://github.com/claromes/waybacktweets)."""  # noqa: E501
-)
-
-st.title(
-    "Wayback Tweets",
-    anchor=False,
-)
-st.caption(
-    "[![GitHub release (latest by date including pre-releases)](https://img.shields.io/github/v/release/claromes/waybacktweets?include_prereleases)](https://github.com/claromes/waybacktweets/releases) [![Star](https://img.shields.io/github/stars/claromes/waybacktweets?style=social)](https://github.com/claromes/waybacktweets)"  # noqa: E501
-)
-st.caption("Display multiple archived tweets on Wayback Machine")
-
-username = st.text_input("Username", placeholder="Without @")
-
-start_date = datetime.datetime(2006, 3, 1)
-end_date = datetime.datetime.now()
-
-st.session_state.archived_timestamp_filter = st.date_input(
-    "Tweets saved between",
-    (start_date, end_date),
-    start_date,
-    end_date,
-    format="YYYY/MM/DD",
-    help="YYYY/MM/DD",
-)
-
-not_available = st.checkbox("Only tweets not available")
-
-unique = st.checkbox(
-    "Only unique URLs",
-    help="Filtering by the collapse option using the urlkey field",
-)
-
-query = st.button("Query", type="primary", use_container_width=True)
-
-# Tweet Listing Settings
-
-
-if username != st.session_state.current_username:
-    st.session_state.current_username = username
-    st.session_state.offset = 0
-
-if query or st.session_state.count:
-    tweets_per_page = 25
-
-    st.session_state.count = tweets_count(
-        username, st.session_state.archived_timestamp_filter
-    )
-
-    st.caption(
-        "The search optimization uses an 8-digit [collapsing strategy](https://github.com/internetarchive/wayback/blob/master/wayback-cdx-server/README.md?ref=hackernoon.com#collapsing), refining the captures to one per day. The number of tweets per page is set to 25, and this is a fixed value due to the API rate limit."  # noqa: E501
-    )
-    st.write(f"**{st.session_state.count} URLs have been captured**")
-
-    if st.session_state.count:
-        if tweets_per_page > st.session_state.count:
-            tweets_per_page = st.session_state.count
-
-    try:
-        progress = st.empty()
-
-        # Tweet Listing Processing
-
-        response = WaybackTweets(
-            username,
-            unique,
-            st.session_state.archived_timestamp_filter[0],
-            st.session_state.archived_timestamp_filter[1],
-            tweets_per_page,
-        )
-        archived_tweets = response.get()
-
-        with st.spinner("Parsing..."):
-            if archived_tweets:
-                field_options = [
-                    "archived_urlkey",
-                    "archived_timestamp",
-                    "original_tweet_url",
-                    "archived_tweet_url",
-                    "parsed_tweet_url",
-                    "parsed_archived_tweet_url",
-                    "parsed_tweet_text_mimetype_json",
-                    "available_tweet_text",
-                    "available_tweet_is_RT",
-                    "available_tweet_info",
-                    "archived_mimetype",
-                    "archived_statuscode",
-                ]
-
-                parser = TweetsParser(archived_tweets, username, field_options)
-                parsed_tweets = parser.parse()
-
-                exporter = TweetsExporter(parsed_tweets, username, field_options)
-                df = exporter.dataframe
-
-                # file_path = "claromes_tweets_20240610210338.csv"
-                # df = pd.read_csv(file_path)
-                # df = df.fillna("")
-
-                archived_urlkey = df["archived_urlkey"]
-                archived_timestamp = df["archived_timestamp"]
-                original_tweet_url = df["original_tweet_url"]
-                archived_tweet_url = df["archived_tweet_url"]
-                parsed_tweet_url = df["parsed_tweet_url"]
-                parsed_archived_tweet_url = df["parsed_archived_tweet_url"]
-                parsed_tweet_text_mimetype_json = df["parsed_tweet_text_mimetype_json"]
-                available_tweet_text = df["available_tweet_text"]
-                available_tweet_is_RT = df["available_tweet_is_RT"]
-                available_tweet_info = df["available_tweet_info"]
-                archived_mimetype = df["archived_mimetype"]
-                archived_statuscode = df["archived_statuscode"]
-
-                st.divider()
-
-                st.session_state.current_username = username
-
-                return_none_count = 0
-
-                start_index = st.session_state.offset
-                end_index = min(st.session_state.count, start_index + tweets_per_page)
-
-                for i in range(tweets_per_page):
-                    try:
-                        if original_tweet_url[i]:
-
-                            # Display all tweets
-                            if not not_available:
-                                st.markdown(
-                                    f'{i+1 + st.session_state.offset}. [**archived url ↗**]({archived_tweet_url[i]}) · [**tweet url ↗**]({original_tweet_url[i]}) · **MIME type:** {archived_mimetype[i]} · **archived timestamp:** {datetime.datetime.strptime(str(archived_timestamp[i]), "%Y%m%d%H%M%S")} · **archived status code:** {archived_statuscode[i]}'  # noqa: E501
-                                )
-
-                                # Display available tweets
-                                if available_tweet_text[i]:
-                                    if available_tweet_is_RT[i]:
-                                        st.info("*Retweet*")
-
-                                    st.write(available_tweet_text[i])
-                                    st.write(f"**{available_tweet_info[i]}**")
-
-                                    st.divider()
-
-                                # Display tweets not available with text/html, unk, warc/revisit return # noqa: E501
-                                elif (
-                                    archived_mimetype[i] != "application/json"
-                                    and not available_tweet_text[i]
-                                ):
-                                    if (
-                                        ".jpg" in original_tweet_url[i]
-                                        or ".png" in original_tweet_url[i]
-                                    ) and (400 <= archived_statuscode[i] <= 511):
-                                        components.iframe(
-                                            archived_tweet_url[i],
-                                            height=500,
-                                            scrolling=True,
-                                        )
-                                    elif "/status/" not in original_tweet_url[i]:
-                                        st.info(
-                                            "This isn't a status or is not available"
-                                        )
-                                    elif (
-                                        check_double_status(
-                                            archived_tweet_url[i], original_tweet_url[i]
-                                        )
-                                        or f"{st.session_state.current_username}"
-                                        not in original_tweet_url[i]
-                                    ):
-                                        st.info(
-                                            f"Replying to {st.session_state.current_username}"  # noqa: E501
-                                        )
-                                    else:
-                                        components.iframe(
-                                            archived_tweet_url[i],
-                                            height=500,
-                                            scrolling=True,
-                                        )
-
-                                    st.divider()
-
-                                # Display tweets not available with application/json return # noqa: E501
-                                elif (
-                                    archived_mimetype[i] == "application/json"
-                                    and not available_tweet_text[i]
-                                ):
-                                    st.code(parsed_tweet_text_mimetype_json[i])
-                                    # st.json(json_data, expanded=False)
-
-                                    st.divider()
-
-                            # Display only tweets not available
-                            if not_available:
-                                return_none_count += 1
-
-                                st.markdown(
-                                    f'{i+1 + st.session_state.offset}. [**archived url ↗**]({archived_tweet_url[i]}) · [**tweet url ↗**]({original_tweet_url[i]}) · **MIME type:** {archived_mimetype[i]} · **archived timestamp:** {datetime.datetime.strptime(str(archived_timestamp[i]), "%Y%m%d%H%M%S")} · **archived status code:** {archived_statuscode[i]}'  # noqa: E501
-                                )
-
-                                # Display tweets not available with text/html, unk, warc/revisit return # noqa: E501
-                                if (
-                                    archived_mimetype[i] != "application/json"
-                                    and not available_tweet_text[i]
-                                ):
-                                    if (
-                                        ".jpg" in original_tweet_url[i]
-                                        or ".png" in original_tweet_url[i]
-                                    ) and (400 <= archived_statuscode[i] <= 511):
-                                        components.iframe(
-                                            archived_tweet_url[i],
-                                            height=500,
-                                            scrolling=True,
-                                        )
-                                    elif "/status/" not in original_tweet_url[i]:
-                                        st.info(
-                                            "This isn't a status or is not available"
-                                        )
-                                    elif (
-                                        check_double_status(
-                                            archived_tweet_url[i], original_tweet_url[i]
-                                        )
-                                        or f"{st.session_state.current_username}"
-                                        not in original_tweet_url[i]
-                                    ):
-                                        st.info(
-                                            f"Replying to {st.session_state.current_username}"  # noqa: E501
-                                        )
-                                    else:
-                                        components.iframe(
-                                            archived_tweet_url[i],
-                                            height=500,
-                                            scrolling=True,
-                                        )
-
-                                    st.divider()
-
-                                # Display tweets not available with application/json return # noqa: E501
-                                elif (
-                                    archived_mimetype[i] == "application/json"
-                                    and not available_tweet_text[i]
-                                ):
-                                    st.code(parsed_tweet_text_mimetype_json[i])
-                                    # st.json(json_data, expanded=False)
-
-                                    st.divider()
-
-                                progress.write(
-                                    f"{return_none_count} URLs have been captured in the range {start_index}-{end_index}"  # noqa: E501
-                                )
-                        pass
-                    except IndexError:
-                        if start_index <= 0:
-                            st.session_state.prev_disabled = True
-                        else:
-                            st.session_state.prev_disabled = False
-
-                        st.session_state.next_disabled = True
-
-            prev, _, next = st.columns([3, 4, 3])
-
-            prev.button(
-                "Previous",
-                disabled=st.session_state.prev_disabled,
-                key="prev_button_key",
-                on_click=prev_page,
-                type="primary",
-                use_container_width=True,
-            )
-            next.button(
-                "Next",
-                disabled=st.session_state.next_disabled,
-                key="next_button_key",
-                on_click=next_page,
-                type="primary",
-                use_container_width=True,
-            )
-
-        if not archived_tweets:
-            st.error("Unable to query the Wayback Machine API.")
-    except TypeError as e:
-        st.error(
-            f"""
-        {e}. Refresh this page and try again.
-
-        If the problem persists [open an issue](https://github.com/claromes/waybacktweets/issues)."""  # noqa: E501
-        )
-        st.session_state.offset = 0
index ebaebc8d104e068b7ef14b03058053485fa2d1bf..23596ecc6d436422ee238108187d78365260f37e 100644 (file)
@@ -5,6 +5,7 @@ CLI functions for retrieving archived tweets.
 from datetime import datetime
 
 import click
+from requests import exceptions
 from rich import print as rprint
 
 from waybacktweets.export_tweets import TweetsExporter
@@ -83,7 +84,7 @@ def cli(username, unique, timestamp_from, timestamp_to, limit):
             exporter.save_to_json()
             exporter.save_to_html()
 
-    except TypeError as e:
+    except exceptions.RequestException as e:  # base of all requests errors
         rprint(f"[red]{e}")
     finally:
         rprint(
index 76ad899386e9e81ca16e8be54f42dd8e03d6b91b..f182d244214bc0cfc9948ae38987a3d3af3d1237 100644 (file)
@@ -1,9 +1,8 @@
 import re
-import time
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from urllib.parse import unquote
 
-import requests
+from requests import exceptions
 from rich import print as rprint
 from rich.progress import Progress
 
@@ -12,6 +11,7 @@ from waybacktweets.utils import (
     check_pattern_tweet,
     clean_tweet_url,
     delete_tweet_pathnames,
+    get_response,
     semicolon_parser,
 )
 
@@ -26,7 +26,7 @@ class TwitterEmbed:
         """Parses the archived tweets when they are still available."""
         try:
             url = f"https://publish.twitter.com/oembed?url={self.tweet_url}"
-            response = requests.get(url)
+            response = get_response(url=url)
 
             if response:
                 json_response = response.json()
@@ -62,7 +62,7 @@ class TwitterEmbed:
                         is_RT.append(author_name != author_tweet)
 
                 return tweet_content, is_RT, user_info
-        except Exception:
+        except exceptions.RequestException:  # base of all requests errors
             rprint("[yellow]Error parsing the tweet, but the CDX data was saved.")
             return None
 
@@ -75,18 +75,8 @@ class JsonParser:
 
     def parse(self):
         """Parses the archived tweets in JSON format."""
-
-        max_attempts = 5
         try:
-            for attempt in range(max_attempts):
-                try:
-                    response = requests.get(self.archived_tweet_url)
-                    break
-                except requests.exceptions.ConnectionError:
-                    if attempt < max_attempts - 1:
-                        time.sleep(0.5)
-                    else:
-                        raise
+            response = get_response(url=self.archived_tweet_url)
 
             if response:
                 json_data = response.json()
@@ -100,10 +90,13 @@ class JsonParser:
                     )
 
                 return json_data.get("text", json_data)
-        except Exception:
+        except exceptions.ConnectionError:
             rprint(
-                f"[yellow]Connection error with {self.archived_tweet_url}. Error parsing the JSON, but the CDX data was saved."  # noqa: E501
+                f"[yellow]Connection error with {self.archived_tweet_url}. Max retries exceeded. Error parsing the JSON, but the CDX data was saved."  # noqa: E501
             )
+            return ""
+        except exceptions.RequestException:  # base of all requests errors
+            rprint("[yellow]Error parsing the JSON, but the CDX data was saved.")
 
             return ""
 
@@ -199,7 +192,7 @@ class TweetsParser:
                     try:
                         future.result()
                     except Exception as e:
-                        rprint(f"[red]{e}")
+                        rprint(f"[red]{e}...")
 
                     progress.update(task, advance=1)
 
index 009362990f178c9b2a46dd9a032ea1e0f7e7255c..b6c561e47313f884693a1120a93031ab31b7c27c 100644 (file)
@@ -1,6 +1,8 @@
-import requests
+from requests import exceptions
 from rich import print as rprint
 
+from waybacktweets.utils import get_response
+
 
 class WaybackTweets:
     """Requests data from the Wayback CDX Server API and returns it in JSON format."""
@@ -35,15 +37,17 @@ class WaybackTweets:
         print("Making a request to the Internet Archive...")
 
         try:
-            response = requests.get(url, params=params)
+            response = get_response(url=url, params=params)
 
             if response:
                 return response.json()
-        except requests.exceptions.ReadTimeout:
+        except exceptions.ReadTimeout:
             rprint("[red]Connection to web.archive.org timed out.")
-        except requests.exceptions.ConnectionError:
-            rprint("[red]Failed to establish a new connection with web.archive.org.")
-        except requests.exceptions.HTTPError:
+        except exceptions.ConnectionError:
+            rprint(
+                "[red]Failed to establish a new connection with web.archive.org. Max retries exceeded."  # noqa: E501
+            )
+        except exceptions.HTTPError:
             rprint(
                 "[red]Temporarily Offline: Internet Archive services are temporarily offline. Please check Internet Archive Twitter feed (https://twitter.com/internetarchive) for the latest information."  # noqa: E501
             )
index 822a5ddd9dc2a5168c084902ab0b6fc0da516497..65c74a219c176eab00df191bb08004b49f0962b8 100644 (file)
@@ -4,6 +4,41 @@ Helper functions.
 
 import re
 
+import requests
+from requests.adapters import HTTPAdapter
+from urllib3.util.retry import Retry
+
+
+def get_response(url, params=None):
+    """
+    Sends a GET request to the specified URL and returns the response.
+
+    Connection failures are retried up to 3 times (backoff factor 0.3).
+
+    :param url: The URL to request.
+    :param params: Optional query parameters passed to the GET request.
+
+    :returns: The response, or None if the status code is between 400 and 511.
+    """
+    retry = Retry(connect=3, backoff_factor=0.3)
+    adapter = HTTPAdapter(max_retries=retry)
+    headers = {
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36"  # noqa: E501
+    }
+
+    # Close the session once the request is done so its pooled connections
+    # are released; the response body is already loaded at that point.
+    with requests.Session() as session:
+        session.mount("http://", adapter)
+        session.mount("https://", adapter)
+
+        response = session.get(url, params=params, headers=headers)
+
+    if 400 <= response.status_code <= 511:
+        return None
+
+    return response
+
 
 def clean_tweet_url(tweet_url, username):
     """
index 1e803faba680415a86b289b2007b8903a8d839ae..5434980a973d594e8f3cc8f210d63a9138026f78 100644 (file)
@@ -18,6 +18,7 @@ class HTMLTweetsVisualizer:
 
     def generate(self):
         """Generates an HTML file."""
+
         html = f"<html>\n<head>\n<title>@{self.username} archived tweets</title>\n"
         html += "<style>\n"
         html += "body { font-family: monospace; background-color: #f5f8fa; color: #1c1e21; margin: 0; padding: 20px; }\n"
@@ -38,7 +39,16 @@ class HTMLTweetsVisualizer:
             html += '<div class="tweet">\n'
 
             if (
-                tweet["archived_mimetype"] != "application/json"
+                (
+                    tweet["archived_mimetype"] != "application/json"
+                    and not tweet["parsed_tweet_text_mimetype_json"]
+                )
+                and not tweet["available_tweet_text"]
+            ) or (
+                (
+                    tweet["archived_mimetype"] == "application/json"
+                    and not tweet["parsed_tweet_text_mimetype_json"]
+                )
                 and not tweet["available_tweet_text"]
             ):
                 html += f'<iframe src="{tweet["parsed_archived_tweet_url"]}" frameborder="0" scrolling="auto"></iframe>\n'
@@ -54,7 +64,10 @@ class HTMLTweetsVisualizer:
                 html += f'<p><strong class="content">Available Tweet Is Retweet:</strong> {tweet["available_tweet_is_RT"]}</p>\n'
                 html += f'<p><strong class="content">Available Tweet Username:</strong> {tweet["available_tweet_info"]}</p>\n'
 
-            if tweet["archived_mimetype"] == "application/json":
+            if (
+                tweet["archived_mimetype"] == "application/json"
+                and tweet["parsed_tweet_text_mimetype_json"]
+            ) and not tweet["available_tweet_text"]:
                 html += f'<p><strong class="content">Parsed Tweet Text (application/json):</strong> {tweet["parsed_tweet_text_mimetype_json"]}</p>\n'
 
             html += "<br>\n"