secondaryBackgroundColor = "gainsboro"
textColor = "black"
backgroundColor = "whitesmoke"
-font = "sans serif"
+font = "serif"
[client]
toolbarMode = "minimal"
- Tristan Lee (Bellingcat's Data Scientist) for the idea behind the application.
- Jessica Smith (Snowflake's Marketing Specialist) and the Streamlit/Snowflake teams for the additional server resources on Streamlit Cloud.
- The OSINT community for recommending the application.
+
+> [!NOTE]
+> If the application is down, please check the [Streamlit Cloud Status](https://www.streamlitstatus.com/).
+import base64
from datetime import datetime
import streamlit as st
-import streamlit.components.v1 as components
from waybacktweets.api.export import TweetsExporter
-from waybacktweets.api.parse import JsonParser, TweetsParser
+from waybacktweets.api.parse import TweetsParser
from waybacktweets.api.request import WaybackTweets
-from waybacktweets.config.config import config
+from waybacktweets.api.visualize import HTMLTweetsVisualizer
+from waybacktweets.config import FIELD_OPTIONS, config
-# Initial Settings
+# ------ Initial Settings ------ #
-LOGO = "assets/parthenon.png"
+PAGE_ICON = "assets/parthenon.png"
TITLE = "assets/waybacktweets.png"
-FIELD_OPTIONS = [
- "parsed_archived_timestamp",
- "archived_tweet_url",
- "parsed_archived_tweet_url",
- "original_tweet_url",
- "parsed_tweet_url",
- "available_tweet_text",
- "available_tweet_is_RT",
- "available_tweet_info",
- "archived_mimetype",
- "archived_statuscode",
-]
+PREVIEW_IMAGE = "assets/preview_image.jpg"
+DOWNLOAD = "assets/download.svg"
+
+collapse = None
+matchtype = None
+start_date = datetime(2006, 1, 1)
+end_date = datetime.now()
+
+# ------ Verbose Mode Configuration ------ #
+
+config.verbose = True
+
+# ------ Page Configuration ------ #
st.set_page_config(
page_title="Wayback Tweets",
- page_icon=LOGO,
+ page_icon=PAGE_ICON,
layout="centered",
menu_items={
"About": f"""
[](https://github.com/claromes/waybacktweets/releases) [](https://github.com/claromes/waybacktweets/blob/main/LICENSE.md) [](https://github.com/claromes/waybacktweets)
- Application that displays multiple archived tweets on Wayback Machine to avoid opening each link manually.
+ The application is a prototype hosted on Streamlit Cloud, allowing users to apply filters and download the data in different formats.
- The application is a prototype hosted on Streamlit Cloud, allowing users to apply filters and view tweets that lack the original URL. [Read more](https://claromes.github.io/waybacktweets/streamlit.html).
-
- © 2023 - {datetime.now().year}, [Claromes](https://claromes.com) · Icon by The Doodle Library · Title font by Google, licensed under the Open Font License
+ © 2023 - {end_date.year}, [Claromes](https://claromes.com)
---
""", # noqa: E501
},
)
-# https://discuss.streamlit.io/t/remove-hide-running-man-animation-on-top-of-page/21773/3
-st.html(
- """
-<style>
- header[data-testid="stHeader"] {
- opacity: 0.5;
- }
- iframe {
- border: 1px solid #dddddd;
- border-radius: 0.5rem;
- }
- div[data-testid="InputInstructions"] {
- visibility: hidden;
- }
- img[data-testid="stLogo"] {
- scale: 4;
- padding-left: 10px;
- }
- button[data-testid="StyledFullScreenButton"] {
- display: none;
- }
-</style>
-"""
-)
+# ------ Set States ------ #
if "current_username" not in st.session_state:
st.session_state.current_username = ""
-if "prev_disabled" not in st.session_state:
- st.session_state.prev_disabled = False
-
-if "next_disabled" not in st.session_state:
- st.session_state.next_disabled = False
-
-if "next_button" not in st.session_state:
- st.session_state.next_button = False
-
-if "prev_button" not in st.session_state:
- st.session_state.prev_button = False
-
-if "update_component" not in st.session_state:
- st.session_state.update_component = 0
-
-if "offset" not in st.session_state:
- st.session_state.offset = 0
-
if "count" not in st.session_state:
st.session_state.count = False
-start_date = datetime(2006, 1, 1)
-end_date = datetime.now()
-
if "archived_timestamp_filter" not in st.session_state:
st.session_state.archived_timestamp_filter = (start_date, end_date)
+# ------ Add Custom CSS Style ------ #
-# Verbose mode configuration
-
-config.verbose = True
-
-
-# Pagination Settings
-
-
-def scroll_into_view():
- script = f"""
- <script>
- window.parent.document.querySelector('section.main').scrollTo(0, 0);
- let update_component = {st.session_state.update_component}
- </script>
+st.html(
"""
+ <style>
+ header[data-testid="stHeader"] {
+ opacity: 0.5;
+ }
+ iframe {
+ border: 1px solid #dddddd;
+ border-radius: 0.5rem;
+ }
+ div[data-testid="InputInstructions"] {
+ visibility: hidden;
+ }
+ button[data-testid="StyledFullScreenButton"] {
+ display: none;
+ }
+ </style>
+ """
+)
- components.html(script, width=0, height=0)
-
-
-def prev_page():
- st.session_state.offset -= tweets_per_page
-
- st.session_state.update_component += 1
- scroll_into_view()
-
-
-def next_page():
- st.session_state.offset += tweets_per_page
-
- st.session_state.update_component += 1
- scroll_into_view()
-
-
-# Requesting
+# ------ Requests ------ #
-@st.cache_data(ttl=1800, show_spinner=False)
+@st.cache_data(ttl=600, show_spinner=True)
def wayback_tweets(
username,
collapse,
return archived_tweets
-@st.cache_data(ttl=1800, show_spinner=False)
-def tweets_parser(archived_tweets, field_options):
+@st.cache_data(ttl=600, show_spinner=True)
+def tweets_parser(archived_tweets, username, field_options):
parser = TweetsParser(archived_tweets, username, field_options)
parsed_tweets = parser.parse()
return parsed_tweets
-@st.cache_data(ttl=1800, show_spinner=False)
+@st.cache_data(ttl=600, show_spinner=True)
def tweets_exporter(parsed_tweets, username, field_options):
exporter = TweetsExporter(parsed_tweets, username, field_options)
df = exporter.dataframe
+ file_name = exporter.filename
- return df
+ return df, file_name
-@st.cache_data(ttl=1800, show_spinner=False)
-def tweets_json_parser():
- if archived_mimetype[i] == "application/json":
- json_parser = JsonParser(parsed_archived_tweet_url[i])
- text_json = json_parser.parse()
+# ------ User Interface Settings ------ #
- if text_json:
- return text_json
+st.info(
+ """🥳 [**Pre-release 1.0x: New Streamlit app, CLI and Python module**](https://claromes.github.io/waybacktweets)""" # noqa: E501
+)
- return None
+st.image(TITLE, use_column_width="never")
+st.caption(
+ "[](https://github.com/claromes/waybacktweets/releases) [](https://github.com/claromes/waybacktweets)" # noqa: E501
+)
+st.write("Retrieve archived tweets CDX data in CSV, JSON, and HTML formats.")
+st.caption(
+ "This application uses the Wayback Tweets Python package, which can be used as a module or as a standalone command line tool. [Read the documentation](https://claromes.github.io/waybacktweets)." # noqa: E501
+)
-def display_tweet_header():
- header = st.markdown(
- f"[**archived url ↗**]({archived_tweet_url[i]}) · [**tweet url ↗**]({original_tweet_url[i]}) · **mimetype:** {archived_mimetype[i]} · **archived timestamp:** {parsed_archived_timestamp[i]} · **archived status code:** {archived_statuscode[i]}" # noqa: E501
- )
+st.caption(
+ "To access the legacy version of Wayback Tweets [click here](https://waybacktweets-legacy.streamlit.app)." # noqa: E501
+)
+
+st.divider()
- return header
+# -- Filters -- #
+username = st.text_input("Username *", key="username", placeholder="Without @")
-def display_tweet_iframe():
- tweet_iframe = components.iframe(
- archived_tweet_url[i],
- height=500,
- scrolling=True,
+with st.expander("Filtering"):
+ start_date = datetime(2006, 1, 1)
+ end_date = datetime.now()
+
+ st.session_state.archived_timestamp_filter = st.date_input(
+ "Tweets saved between",
+ (start_date, end_date),
+ start_date,
+ end_date,
+ format="YYYY/MM/DD",
+ help="Using the `from` and `to` filters. Format: YYYY/MM/DD",
)
- return tweet_iframe
+ col1, col2 = st.columns(2)
+
+ with col1:
+ limit = st.text_input(
+ "Limit",
+ key="limit",
+ help="Query result limits",
+ )
+ with col2:
+ offset = st.text_input(
+ "Offset",
+ key="offset",
+ help="Allows for a simple way to scroll through the results",
+ )
-# Interface Settings
+ col3, col4 = st.columns(2)
-st.logo(LOGO)
+ with col3:
+ not_available = st.checkbox(
+ "Only tweets not available",
+ key="not_available",
+ help="Checks if the archived URL still exists on Twitter",
+ )
-st.success(
- """**v1.0 🎉: CLI and Python Module**
+ with col4:
+ unique = st.checkbox(
+ "Only unique Wayback Machine URLs",
+ key="unique",
+ help="Filtering by the collapse option using the `urlkey` field and the URL Match Scope `prefix`", # noqa: E501
+ )
-$ `pip install waybacktweets`
-$ `waybacktweets --from 20150101 --to 20191231 --limit 250 jack`
+query = st.button("Query", type="primary", use_container_width=True)
-Retrieve archived tweets CDX data in CSV, JSON, and HTML formats using the command line.
+# ------ Results ------ #
-Read the documentation: [claromes.github.io/waybacktweets](https://claromes.github.io/waybacktweets).""" # noqa: E501
-)
+if username != st.session_state.current_username:
+ st.session_state.current_username = username
-st.image(TITLE, use_column_width="never")
-st.caption(
- "[](https://github.com/claromes/waybacktweets/releases) [](https://github.com/claromes/waybacktweets)" # noqa: E501
-)
-st.caption("Display multiple archived tweets on Wayback Machine.")
-st.caption(
- "Download data via command line with the [`waybacktweets`](https://pypi.org/project/waybacktweets) Python package." # noqa: E501
-)
+if query or st.session_state.count:
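+    # The "unique" checkbox collapses CDX results on the urlkey field and widens the URL match scope to prefix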
+ if unique:
+ collapse = "urlkey"
+ matchtype = "prefix"
-username = st.text_input("Username", placeholder="Without @")
+ try:
+ wayback_tweets = wayback_tweets(
+ st.session_state.current_username,
+ collapse,
+ st.session_state.archived_timestamp_filter[0],
+ st.session_state.archived_timestamp_filter[1],
+ limit,
+ offset,
+ matchtype,
+ )
-start_date = datetime(2006, 1, 1)
-end_date = datetime.now()
+ if not wayback_tweets:
+ st.error("No data was saved due to an empty response.")
+ st.stop()
-st.session_state.archived_timestamp_filter = st.date_input(
- "Tweets saved between",
- (start_date, end_date),
- start_date,
- end_date,
- format="YYYY/MM/DD",
- help="Using the `from` and `to` filters. Format: YYYY/MM/DD",
-)
+ parsed_tweets = tweets_parser(
+ wayback_tweets, st.session_state.current_username, FIELD_OPTIONS
+ )
-not_available = st.checkbox(
- "Only tweets not available",
- help="Checks if the archived URL still exists on Twitter",
-)
+ df, file_name = tweets_exporter(
+ parsed_tweets, st.session_state.current_username, FIELD_OPTIONS
+ )
-unique = st.checkbox(
- "Only unique Wayback Machine URLs",
- help="Filtering by the collapse option using the `urlkey` field and the URL Match Scope `prefix`", # noqa: E501
-)
+ csv_data = df.to_csv(index=False)
+ json_data = df.to_json(orient="records", lines=False)
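+        # HTMLTweetsVisualizer accepts either a JSON file path or a JSON string, so the dataframe export can be passed in directly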
+ html = HTMLTweetsVisualizer(username, json_data)
+ html_content = html.generate()
-query = st.button("Query", type="primary", use_container_width=True)
+ st.session_state.count = len(df)
+ st.write(f"**{st.session_state.count} URLs have been captured**")
-# Tweet Listing Settings
+ # -- HTML -- #
-if username != st.session_state.current_username:
- st.session_state.current_username = username
- st.session_state.offset = 0
+ st.header("HTML", divider="gray")
+ st.write(
+ f"Visualize tweets more efficiently through iframes. Download the @{st.session_state.current_username}'s archived tweets in HTML." # noqa: E501
+ )
-if query or st.session_state.count:
- tweets_per_page = 25
+ col5, col6 = st.columns([1, 18])
- collapse = None
- matchType = None
+ with col5:
+ st.image(DOWNLOAD, width=22)
- if unique:
- collapse = "urlkey"
- matchType = "prefix"
+ with col6:
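+        # Expose the generated HTML as a base64 data: URI so the link downloads it without writing a file on the server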
+ b64_html = base64.b64encode(html_content.encode()).decode()
+ href_html = f"data:text/html;base64,{b64_html}"
- try:
- with st.spinner("Waybacking..."):
- wayback_tweets = wayback_tweets(
- st.session_state.current_username,
- collapse,
- st.session_state.archived_timestamp_filter[0],
- st.session_state.archived_timestamp_filter[1],
- tweets_per_page,
- st.session_state.offset,
- matchType,
+ st.markdown(
+ f'<a href="{href_html}" download="{file_name}.html" title="Download {file_name}.html">{file_name}.html</a>', # noqa: E501
+ unsafe_allow_html=True,
)
- parsed_tweets = tweets_parser(wayback_tweets, FIELD_OPTIONS)
- df = tweets_exporter(
- parsed_tweets, st.session_state.current_username, FIELD_OPTIONS
- )
+ st.image(PREVIEW_IMAGE, "Preview image")
- st.session_state.count = len(df)
-
- # st.caption(
- # "The number of tweets per page is set to 25, and this is a fixed value due to the API rate limit." # noqa: E501
- # )
- # st.write(f"**{st.session_state.count} URLs have been captured**")
-
- if st.session_state.count:
- if tweets_per_page > st.session_state.count:
- tweets_per_page = st.session_state.count
-
- # Tweet Listing Processing
-
- progress = st.empty()
-
- parsed_archived_timestamp = df["parsed_archived_timestamp"]
- archived_tweet_url = df["archived_tweet_url"]
- parsed_archived_tweet_url = df["parsed_archived_tweet_url"]
- original_tweet_url = df["original_tweet_url"]
- parsed_tweet_url = df["parsed_tweet_url"]
- available_tweet_text = df["available_tweet_text"]
- available_tweet_is_RT = df["available_tweet_is_RT"]
- available_tweet_info = df["available_tweet_info"]
- archived_mimetype = df["archived_mimetype"]
- archived_statuscode = df["archived_statuscode"]
-
- st.divider()
- st.session_state.current_username = username
-
- return_none_count = 0
-
- start_index = st.session_state.offset
- end_index = min(st.session_state.count, start_index + tweets_per_page)
-
- for i in range(tweets_per_page):
- try:
- parsed_text_json = tweets_json_parser()
-
- # Display all tweets
- if not not_available:
- # Display available tweets
- if available_tweet_text[i]:
- display_tweet_header()
-
- if available_tweet_is_RT[i]:
- st.info("*Retweet*")
-
- st.write(available_tweet_text[i])
- st.write(f"**{available_tweet_info[i]}**")
-
- st.divider()
- # Display tweets not available with text/html, unk, warc/revisit mimetype or application/json mimetype without parsed JSON text # noqa: E501
- elif (
- archived_mimetype[i] != "application/json"
- and not available_tweet_text[i]
- ):
- display_tweet_header()
- if (
- ".jpg" in parsed_tweet_url[i]
- or ".png" in parsed_tweet_url[i]
- ) and (400 <= archived_statuscode[i] <= 511):
- display_tweet_iframe()
- elif "/status/" not in parsed_tweet_url[i]:
- st.info(
- "This isn't a status or is not available" # noqa: E501
- )
- elif (
- f"{st.session_state.current_username}"
- not in parsed_tweet_url[i]
- ):
- st.info(
- f"Replying to {st.session_state.current_username}" # noqa: E501
- )
- else:
- display_tweet_iframe()
-
- st.divider()
- # Display tweets not available with application/json mimetype and parsed JSON text # noqa: E501
- elif (
- archived_mimetype[i] == "application/json"
- and not available_tweet_text[i]
- ):
- display_tweet_header()
- st.code(parsed_text_json)
-
- st.divider()
-
- # Display only tweets not available
- if not_available:
- # Display tweets not available with text/html, unk, warc/revisit return # noqa: E501
- if (
- archived_mimetype[i] != "application/json"
- and not available_tweet_text[i]
- ):
- return_none_count += 1
-
- display_tweet_header()
- if (
- ".jpg" in parsed_tweet_url[i]
- or ".png" in parsed_tweet_url[i]
- ) and (400 <= archived_statuscode[i] <= 511):
- display_tweet_iframe()
- elif "/status/" not in parsed_tweet_url[i]:
- st.info(
- "This isn't a status or is not available" # noqa: E501
- )
- elif (
- f"{st.session_state.current_username}"
- not in parsed_tweet_url[i]
- ):
- st.info(
- f"Replying to {st.session_state.current_username}" # noqa: E501
- )
- else:
- display_tweet_iframe()
-
- st.divider()
-
- # Display tweets not available with application/json return # noqa: E501
- elif (
- archived_mimetype[i] == "application/json"
- and not available_tweet_text[i]
- ):
- return_none_count += 1
-
- display_tweet_header()
- st.code(parsed_text_json)
-
- st.divider()
-
- progress.write(
- f"{return_none_count} URLs have been captured in the range {start_index}-{end_index}" # noqa: E501
- )
- except IndexError:
- if start_index <= 0:
- st.session_state.prev_disabled = True
- else:
- st.session_state.prev_disabled = False
-
- st.session_state.next_disabled = True
-
- prev, _, next = st.columns([3, 4, 3])
-
- prev.button(
- "Previous",
- disabled=st.session_state.prev_disabled,
- key="prev_button_key",
- on_click=prev_page,
- type="primary",
- use_container_width=True,
- )
- next.button(
- "Next",
- disabled=st.session_state.next_disabled,
- key="next_button_key",
- on_click=next_page,
- type="primary",
- use_container_width=True,
+ # -- CSV -- #
+
+ st.header("CSV", divider="gray")
+ st.write(
+ "Check the data returned in the dataframe below and download the file."
)
- if not wayback_tweets:
- st.error(
- "Failed to establish a new connection with web.archive.org. Max retries exceeded. Please wait a few minutes and try again." # noqa: E501
+ col7, col8 = st.columns([1, 18])
+
+ with col7:
+ st.image(DOWNLOAD, width=22)
+
+ with col8:
+ b64_csv = base64.b64encode(csv_data.encode()).decode()
+ href_csv = f"data:file/csv;base64,{b64_csv}"
+
+ st.markdown(
+ f'<a href="{href_csv}" download="{file_name}.csv" title="Download {file_name}.csv">{file_name}.csv</a>', # noqa: E501
+ unsafe_allow_html=True,
+ )
+
+ st.dataframe(df, use_container_width=True)
+
+ # -- JSON -- #
+
+ st.header("JSON", divider="gray")
+ st.write("Check the data returned in JSON format below and download the file.")
+
+ col9, col10 = st.columns([1, 18])
+
+ with col9:
+ st.image(DOWNLOAD, width=22)
+
+ with col10:
+ b64_json = base64.b64encode(json_data.encode()).decode()
+ href_json = f"data:file/json;base64,{b64_json}"
+
+ st.markdown(
+ f'<a href="{href_json}" download="{file_name}.json" title="Download {file_name}.json">{file_name}.json</a>', # noqa: E501
+ unsafe_allow_html=True,
)
+
+ st.json(json_data, expanded=False)
except TypeError as e:
st.error(
f"""
If the problem persists [open an issue](https://github.com/claromes/waybacktweets/issues).""" # noqa: E501
)
- st.session_state.offset = 0
+ st.stop()
except Exception as e:
- st.error(f"{e}")
+ st.error(str(e))
st.stop()
-requests>=2.30.0
streamlit==1.35.0
waybacktweets>=1.0
--- /dev/null
+<svg version="1.2" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 170 272" width="170" height="272">
+  <title>download-svg</title>
+ <style>
+ .s0 { opacity: .9;fill: none;stroke: #000000;stroke-linecap: round;stroke-linejoin: round;stroke-width: 16 }
+ </style>
+ <path id="Layer" fill-rule="evenodd" class="s0" d="m15 258q35.5-3.7 70.7-3.7c35.3 0 63 3.7 70.3 3.7"/>
+ <path id="Layer" fill-rule="evenodd" class="s0" d="m84.3 164.8q-1.6-37.3-1.6-74.5c0-37.1 1.6-66.3 1.6-74"/>
+ <path id="Layer" fill-rule="evenodd" class="s0" d="m150 134c-4.7 19.7-52.6 64.5-57.6 76.4-5 11.8-62.3-39.4-72.4-62.9"/>
+</svg>
\ No newline at end of file
--- /dev/null
+[theme]
+base = "light"
+primaryColor = "#ab2e33"
+secondaryBackgroundColor = "#efefef"
+textColor = "#000000"
+backgroundColor = "#f9f9f9"
+font = "serif"
+
+[client]
+displayEnabled = true
+toolbarMode = "minimal"
--- /dev/null
+import datetime
+import re
+from urllib.parse import unquote
+
+import requests
+import streamlit as st
+import streamlit.components.v1 as components
+
+year = datetime.datetime.now().year
+
+st.set_page_config(
+ page_title="Wayback Tweets",
+ page_icon="🏛️",
+ layout="centered",
+ menu_items={
+ "About": """
+ ## 🏛️ Wayback Tweets
+
+        Tool that uses the Wayback CDX Server API to display multiple archived tweets from the Wayback Machine, so you don't have to open each link manually. Users can filter by year and view tweets whose original URL is no longer available.
+
+        This tool is a prototype; please feel free to send [feedback](https://github.com/claromes/waybacktweets/issues). Created by [@claromes](https://claromes.com).
+
+ -------
+ """, # noqa: E501
+ },
+)
+
+# https://discuss.streamlit.io/t/remove-hide-running-man-animation-on-top-of-page/21773/3
+hide_streamlit_style = """
+<style>
+ header[data-testid="stHeader"] {
+ opacity: 0.5;
+ }
+ iframe {
+ border: 1px solid #dddddd;
+ border-radius: 0.5rem;
+ }
+ div[data-testid="InputInstructions"] {
+ visibility: hidden;
+ }
+</style>
+"""
+
+st.markdown(hide_streamlit_style, unsafe_allow_html=True)
+
+if "current_handle" not in st.session_state:
+ st.session_state.current_handle = ""
+
+if "prev_disabled" not in st.session_state:
+ st.session_state.prev_disabled = False
+
+if "next_disabled" not in st.session_state:
+ st.session_state.next_disabled = False
+
+if "next_button" not in st.session_state:
+ st.session_state.next_button = False
+
+if "prev_button" not in st.session_state:
+ st.session_state.prev_button = False
+
+if "update_component" not in st.session_state:
+ st.session_state.update_component = 0
+
+if "offset" not in st.session_state:
+ st.session_state.offset = 0
+
+if "saved_at" not in st.session_state:
+ st.session_state.saved_at = (2006, year)
+
+if "count" not in st.session_state:
+ st.session_state.count = False
+
+
+def scroll_into_view():
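+    # Scroll the parent Streamlit page back to the top after paging, via a zero-size HTML component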
+ js = f"""
+ <script>
+ window.parent.document.querySelector('section.main').scrollTo(0, 0);
+ let update_component = {st.session_state.update_component} // Force component update to generate scroll
+ </script>
+ """ # noqa: E501
+
+ components.html(js, width=0, height=0)
+
+
+def clean_tweet(tweet):
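+    # Rebuild a canonical twitter.com status URL when the current handle and a numeric status ID can be recovered from the archived link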
+ handle = st.session_state.current_handle.lower()
+ tweet_lower = tweet.lower()
+
+ pattern = re.compile(r"/status/(\d+)")
+ match_lower_case = pattern.search(tweet_lower)
+ match_original_case = pattern.search(tweet)
+
+ if match_lower_case and handle in tweet_lower:
+ return f"https://twitter.com/{st.session_state.current_handle}/status/{match_original_case.group(1)}" # noqa: E501
+ else:
+ return tweet
+
+
+def clean_link(link):
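+    # Rebuild the Wayback URL around the recovered status ID, reusing the capture timestamp of the current loop iteration (timestamp[i])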
+ handle = st.session_state.current_handle.lower()
+ link = link.lower()
+
+ pattern = re.compile(r"/status/(\d+)")
+ match = pattern.search(link)
+
+ if match and handle in link:
+ return f"https://web.archive.org/web/{timestamp[i]}/https://twitter.com/{st.session_state.current_handle}/status/{match.group(1)}" # noqa: E501
+ else:
+ return link
+
+
+def pattern_tweet(tweet):
+ # Reply: /status//
+ # Link: /status///
+ # Twimg: /status/https://pbs
+
+ pattern = re.compile(r'/status/"([^"]+)"')
+
+ match = pattern.search(tweet)
+ if match:
+ return match.group(1).lstrip("/")
+ else:
+ return tweet
+
+
+def pattern_tweet_id(tweet):
+ # Delete sub-endpoint (/photos, /likes, /retweet...)
+ pattern_username = re.compile(r"https://twitter\.com/([^/]+)/status/\d+")
+ match_username = pattern_username.match(tweet)
+
+ pattern_id = r"https://twitter.com/\w+/status/(\d+)"
+ match_id = re.search(pattern_id, tweet)
+
+ if match_id and match_username:
+ tweet_id = match_id.group(1)
+ username = match_username.group(1)
+ return f"https://twitter.com/{username}/status/{tweet_id}"
+ else:
+ return tweet
+
+
+def check_double_status(url_wb, url_tweet):
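+    # Flag Wayback URLs that contain "/status/" twice while the extracted tweet URL has no twitter.com domain (typical of archived replies)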
+ if url_wb.count("/status/") == 2 and "twitter.com" not in url_tweet:
+ return True
+
+ return False
+
+
+def embed(tweet):
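+    # Query Twitter's public oEmbed endpoint and scrape the returned blockquote HTML for text, author info, and retweet status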
+ try:
+ url = f"https://publish.twitter.com/oembed?url={clean_tweet(tweet)}"
+ response = requests.get(url)
+
+ regex = r'<blockquote class="twitter-tweet"(?: [^>]+)?><p[^>]*>(.*?)<\/p>.*?— (.*?)<\/a>' # noqa: E501
+ regex_author = r"^(.*?)\s*\("
+
+ if response.status_code == 200 or response.status_code == 302:
+ status_code = response.status_code
+ html = response.json()["html"]
+ author_name = response.json()["author_name"]
+
+ matches_html = re.findall(regex, html, re.DOTALL)
+
+ tweet_content = []
+ user_info = []
+ is_RT = []
+
+ for match in matches_html:
+ tweet_content_match = re.sub(r"<a[^>]*>|<\/a>", "", match[0].strip())
+ tweet_content_match = tweet_content_match.replace("<br>", "\n")
+
+ user_info_match = re.sub(r"<a[^>]*>|<\/a>", "", match[1].strip())
+ user_info_match = user_info_match.replace(")", "), ")
+
+ match_author = re.search(regex_author, user_info_match)
+ author_tweet = match_author.group(1)
+
+ if tweet_content_match:
+ tweet_content.append(tweet_content_match)
+ if user_info_match:
+ user_info.append(user_info_match)
+
+ is_RT_match = False
+ if author_name != author_tweet:
+ is_RT_match = True
+
+ is_RT.append(is_RT_match)
+
+ return status_code, tweet_content, user_info, is_RT
+ else:
+ return False
+ except requests.exceptions.Timeout:
+ st.error("Connection to web.archive.org timed out.")
+ except requests.exceptions.ConnectionError:
+ st.error("Failed to establish a new connection with web.archive.org.")
+ except UnboundLocalError:
+ st.empty()
+
+
+@st.cache_data(ttl=1800, show_spinner=False)
+def tweets_count(handle, saved_at):
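+    # Count captures via the CDX API, collapsing on the first 8 timestamp digits (one capture per day) and skipping the header row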
+ url = f"https://web.archive.org/cdx/search/cdx?url=https://twitter.com/{handle}/status/*&collapse=timestamp:8&output=json&from={saved_at[0]}&to={saved_at[1]}" # noqa: E501
+ try:
+ response = requests.get(url)
+
+ if response.status_code == 200:
+ data = response.json()
+ if data and len(data) > 1:
+ total_tweets = len(data) - 1
+ return total_tweets
+ else:
+ return 0
+ except requests.exceptions.Timeout:
+ st.error("Connection to web.archive.org timed out.")
+ st.stop()
+ except requests.exceptions.ConnectionError:
+ st.error("Failed to establish a new connection with web.archive.org.")
+ except UnboundLocalError:
+ st.empty()
+
+
+@st.cache_data(ttl=1800, show_spinner=False)
+def query_api(handle, limit, offset, saved_at):
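+    # Page through the CDX API with limit/offset, using the same one-capture-per-day collapse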
+ if not handle:
+ st.warning("username, please!")
+ st.stop()
+
+ url = f"https://web.archive.org/cdx/search/cdx?url=https://twitter.com/{handle}/status/*&collapse=timestamp:8&output=json&limit={limit}&offset={offset}&from={saved_at[0]}&to={saved_at[1]}" # noqa: E501
+ try:
+ response = requests.get(url)
+ response.raise_for_status()
+
+ if response.status_code == 200 or response.status_code == 304:
+ return response.json()
+ except requests.exceptions.Timeout:
+ st.error("Connection to web.archive.org timed out.")
+ except requests.exceptions.ConnectionError:
+ st.error("Failed to establish a new connection with web.archive.org.")
+ except UnboundLocalError:
+ st.empty()
+ except requests.exceptions.HTTPError:
+ st.error(
+ """
+ **Temporarily Offline**
+
+ Internet Archive services are temporarily offline. Please check Internet Archive [Twitter feed](https://twitter.com/internetarchive/) for the latest information.
+ """ # noqa: E501
+ )
+ st.stop()
+
+
+@st.cache_data(ttl=1800, show_spinner=False)
+def parse_links(links):
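+    # Turn CDX rows into Wayback URLs, cleaned tweet links, MIME types, and timestamps (the first row is the CDX header)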
+ parsed_links = []
+ timestamp = []
+ tweet_links = []
+ parsed_mimetype = []
+
+ for link in links[1:]:
+ tweet_remove_char = unquote(link[2]).replace("’", "")
+ cleaned_tweet = pattern_tweet(tweet_remove_char).strip('"')
+
+ url = f"https://web.archive.org/web/{link[1]}/{tweet_remove_char}"
+
+ parsed_links.append(url)
+ timestamp.append(link[1])
+ tweet_links.append(cleaned_tweet)
+ parsed_mimetype.append(link[3])
+
+ return parsed_links, tweet_links, parsed_mimetype, timestamp
+
+
+def attr(i):
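+    # Render the per-tweet header line (archived URL, original URL, MIME type, capture time); relies on the module-level parse results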
+ original_tweet = pattern_tweet_id(clean_tweet(tweet_links[i]))
+
+ if status:
+ original_tweet = pattern_tweet_id(f"https://twitter.com/{tweet_links[i]}")
+ elif "://" not in tweet_links[i]:
+ original_tweet = pattern_tweet_id(f"https://{tweet_links[i]}")
+
+ st.markdown(
+ f'{i+1 + st.session_state.offset}. [**archived url**]({link}) · [**original url**]({original_tweet}) · **MIME Type:** {mimetype[i]} · **Saved at:** {datetime.datetime.strptime(timestamp[i], "%Y%m%d%H%M%S")}' # noqa: E501
+ )
+
+
+def display_tweet():
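+    # Show the oEmbed-recovered content for tweets that are still available on Twitter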
+ if (
+ mimetype[i] == "application/json"
+ or mimetype[i] == "text/html"
+ or mimetype[i] == "unk"
+ or mimetype[i] == "warc/revisit"
+ ):
+ if is_RT[0] is True:
+ st.info("*Retweet*")
+ st.write(tweet_content[0])
+ st.write(f"**{user_info[0]}**")
+
+ st.divider()
+ else:
+ st.warning("MIME Type was not parsed.")
+
+ st.divider()
+
+
+def display_not_tweet():
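+    # Fall back to an archived iframe, or to the archived JSON payload, when the tweet can no longer be embedded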
+ original_link = pattern_tweet_id(clean_tweet(tweet_links[i]))
+
+ if status:
+ original_link = pattern_tweet_id(f"https://twitter.com/{tweet_links[i]}")
+ elif "://" not in tweet_links[i]:
+ original_link = pattern_tweet_id(f"https://{tweet_links[i]}")
+
+ response_html = requests.get(original_link)
+
+ if (
+ mimetype[i] == "text/html"
+ or mimetype[i] == "warc/revisit"
+ or mimetype[i] == "unk"
+ ):
+ if (
+ ".jpg" in tweet_links[i] or ".png" in tweet_links[i]
+ ) and response_html.status_code == 200:
+ components.iframe(tweet_links[i], height=500, scrolling=True)
+ elif "/status/" not in original_link:
+ st.info("This isn't a status or is not available")
+ elif status or f"{st.session_state.current_handle}" not in original_link:
+ st.info(f"Replying to {st.session_state.current_handle}")
+ else:
+ components.iframe(clean_link(link), height=500, scrolling=True)
+
+ st.divider()
+ elif mimetype[i] == "application/json":
+ try:
+ response_json = requests.get(link)
+
+ if response_json.status_code == 200:
+ json_data = response_json.json()
+
+ if "data" in json_data:
+ if "text" in json_data["data"]:
+ json_text = json_data["data"]["text"]
+ else:
+ json_text = json_data["data"]
+ else:
+ if "text" in json_data:
+ json_text = json_data["text"]
+ else:
+ json_text = json_data
+
+ st.code(json_text)
+ st.json(json_data, expanded=False)
+
+ st.divider()
+ else:
+ st.error(response_json.status_code)
+
+ st.divider()
+ except requests.exceptions.Timeout:
+ st.error("Connection to web.archive.org timed out.")
+ st.divider()
+ except requests.exceptions.ConnectionError:
+ st.error("Failed to establish a new connection with web.archive.org.")
+ st.divider()
+ except UnboundLocalError:
+ st.empty()
+ else:
+ st.warning("MIME Type was not parsed.")
+ st.divider()
+
+
+def prev_page():
+ st.session_state.offset -= tweets_per_page
+
+ # scroll to top config
+ st.session_state.update_component += 1
+ scroll_into_view()
+
+
+def next_page():
+ st.session_state.offset += tweets_per_page
+
+ # scroll to top config
+ st.session_state.update_component += 1
+ scroll_into_view()
+
+
+# UI
+st.title(
+ "Wayback Tweets [](https://github.com/claromes/waybacktweets)", # noqa: E501
+ anchor=False,
+ help="v0.4.3",
+)
+st.write(
+ "Display multiple archived tweets on Wayback Machine and avoid opening each link manually" # noqa: E501
+)
+
+handle = st.text_input("Username", placeholder="jack")
+
+st.session_state.saved_at = st.slider("Tweets saved between", 2006, year, (2006, year))
+
+not_available = st.checkbox(
+ "Original URLs not available",
+ help="Due to changes in X, it is possible to find available tweets if you are logged into X", # noqa: E501
+)
+
+query = st.button("Query", type="primary", use_container_width=True)
+
+if handle != st.session_state.current_handle:
+ st.session_state.current_handle = handle
+ st.session_state.offset = 0
+
+if query or st.session_state.count:
+ tweets_per_page = 25
+
+ st.session_state.count = tweets_count(handle, st.session_state.saved_at)
+
+ st.caption(
+ "The search optimization uses an 8-digit [collapsing strategy](https://github.com/internetarchive/wayback/blob/master/wayback-cdx-server/README.md?ref=hackernoon.com#collapsing), refining the captures to one per day. The number of tweets per page is set to 25, and this is a fixed value due to the API rate limit." # noqa: E501
+ )
+ st.write(f"**{st.session_state.count} URLs have been captured**")
+
+ if st.session_state.count:
+ if tweets_per_page > st.session_state.count:
+ tweets_per_page = st.session_state.count
+
+ try:
+ progress = st.empty()
+ links = query_api(
+ handle, tweets_per_page, st.session_state.offset, st.session_state.saved_at
+ )
+
+ parse = parse_links(links)
+ parsed_links = parse[0]
+ tweet_links = parse[1]
+ mimetype = parse[2]
+ timestamp = parse[3]
+
+ if links:
+ st.divider()
+
+ st.session_state.current_handle = handle
+
+ return_none_count = 0
+
+ start_index = st.session_state.offset
+ end_index = min(st.session_state.count, start_index + tweets_per_page)
+
+ with st.spinner("Fetching..."):
+ for i in range(tweets_per_page):
+ try:
+ if tweet_links[i]:
+ link = parsed_links[i]
+ tweet = embed(tweet_links[i])
+
+ status = check_double_status(link, tweet_links[i])
+
+ if not not_available:
+ attr(i)
+
+ if tweet:
+ status_code = tweet[0]
+ tweet_content = tweet[1]
+ user_info = tweet[2]
+ is_RT = tweet[3]
+
+ display_tweet()
+ elif not tweet:
+ display_not_tweet()
+
+ if not_available:
+ if not tweet:
+ return_none_count += 1
+ attr(i)
+
+ display_not_tweet()
+
+ progress.write(
+ f"{return_none_count} URLs have been captured in the range {start_index}-{end_index}" # noqa: E501
+ )
+
+ if start_index <= 0:
+ st.session_state.prev_disabled = True
+ else:
+ st.session_state.prev_disabled = False
+
+ if i + 1 == st.session_state.count:
+ st.session_state.next_disabled = True
+ else:
+ st.session_state.next_disabled = False
+ except IndexError:
+ if start_index <= 0:
+ st.session_state.prev_disabled = True
+ else:
+ st.session_state.prev_disabled = False
+
+ st.session_state.next_disabled = True
+
+ prev, _, next = st.columns([3, 4, 3])
+
+ prev.button(
+ "Previous",
+ disabled=st.session_state.prev_disabled,
+ key="prev_button_key",
+ on_click=prev_page,
+ type="primary",
+ use_container_width=True,
+ )
+ next.button(
+ "Next",
+ disabled=st.session_state.next_disabled,
+ key="next_button_key",
+ on_click=next_page,
+ type="primary",
+ use_container_width=True,
+ )
+
+ if not links:
+ st.error("Unable to query the Wayback Machine API.")
+ except TypeError as e:
+ st.error(
+ f"""
+ {e}. Refresh this page and try again.
+ """ # noqa: E501
+ )
+ st.session_state.offset = 0
--- /dev/null
+requests==2.30.0
+streamlit==1.27.0
[[package]]
name = "filelock"
-version = "3.15.3"
+version = "3.15.4"
description = "A platform independent file lock."
optional = false
python-versions = ">=3.8"
files = [
- {file = "filelock-3.15.3-py3-none-any.whl", hash = "sha256:0151273e5b5d6cf753a61ec83b3a9b7d8821c39ae9af9d7ecf2f9e2f17404103"},
- {file = "filelock-3.15.3.tar.gz", hash = "sha256:e1199bf5194a2277273dacd50269f0d87d0682088a3c561c15674ea9005d8635"},
+ {file = "filelock-3.15.4-py3-none-any.whl", hash = "sha256:6ca1fffae96225dab4c6eaf1c4f4f28cd2568d3ec2a44e15a08520504de468e7"},
+ {file = "filelock-3.15.4.tar.gz", hash = "sha256:2207938cbc1844345cb01a5a95524dae30f0ce089eba5b00378295a17e3e90cb"},
]
[package.extras]
[[package]]
name = "sphinx-autodoc-typehints"
-version = "2.2.1"
+version = "2.2.2"
description = "Type hints (PEP 484) support for the Sphinx autodoc extension"
optional = false
python-versions = ">=3.9"
files = [
- {file = "sphinx_autodoc_typehints-2.2.1-py3-none-any.whl", hash = "sha256:ac37852861c58a5ca95be13d5a0f49f3661b5341eaf7de8531842135600aeb90"},
- {file = "sphinx_autodoc_typehints-2.2.1.tar.gz", hash = "sha256:26a81e6444c9b82a952519a3b7c52e45f14a0f81c91cfc7063cfcf2ca109d161"},
+ {file = "sphinx_autodoc_typehints-2.2.2-py3-none-any.whl", hash = "sha256:b98337a8530c95b73ba0c65465847a8ab0a13403bdc81294d5ef396bbd1f783e"},
+ {file = "sphinx_autodoc_typehints-2.2.2.tar.gz", hash = "sha256:128e600eeef63b722f3d8dac6403594592c8cade3ba66fd11dcb997465ee259d"},
]
[package.dependencies]
html_file_path = f"{self.filename}.html"
- html = HTMLTweetsVisualizer(json_file_path, html_file_path, self.username)
+ html = HTMLTweetsVisualizer(self.username, json_file_path, html_file_path)
html_content = html.generate()
html.save(html_content)
""" # noqa: E501
url = "https://web.archive.org/cdx/search/cdx"
- status_pathname = "status/*"
+ wildcard_pathname = "/*"
if self.matchtype:
- status_pathname = ""
+ wildcard_pathname = ""
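+            # an explicit matchtype already scopes the CDX query, so the trailing wildcard is dropped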
params = {
- "url": f"https://twitter.com/{self.username}/{status_pathname}",
+ "url": f"https://twitter.com/{self.username}/status{wildcard_pathname}",
"output": "json",
}
"""
import json
-from typing import Any, Dict, List
+import os
+from typing import Any, Dict, List, Union
from waybacktweets.utils import timestamp_parser
Class responsible for generating an HTML file to visualize the parsed data.
Args:
- json_file_path (str): The path of the JSON file.
- html_file_path (str): The path where the HTML file will be saved.
username (str): The username associated with the tweets.
+ json_file_path (Union[str, List[str]]): The path of the JSON file or the JSON data itself.
+ html_file_path (str, optional): The path where the HTML file will be saved.
"""
- def __init__(self, json_file_path: str, html_file_path: str, username: str):
+ def __init__(
+ self,
+ username: str,
+ json_file_path: Union[str, List[str]],
+ html_file_path: str = None,
+ ):
+ self.username = username
self.json_file_path = self._json_loader(json_file_path)
self.html_file_path = html_file_path
- self.username = username
@staticmethod
- def _json_loader(json_file_path: str) -> List[Dict[str, Any]]:
+ def _json_loader(json_file_path: Union[str, List[str]]) -> List[Dict[str, Any]]:
"""
- Reads and loads JSON data from a specified file path.
+ Reads and loads JSON data from a specified file path or JSON string.
Args:
- json_file_path (str): The path of the JSON file.
+ json_file_path (Union[str, List[str]]): The path of the JSON file or the JSON data itself.
Returns:
- The content of the JSON file.
+ The content of the JSON file or data.
"""
- with open(json_file_path, "r", encoding="utf-8") as f:
- return json.load(f)
+ if os.path.isfile(json_file_path):
+ with open(json_file_path, "r", encoding="utf-8") as f:
+ return json.load(f)
+
+ return json.loads(json_file_path)
def generate(self) -> str:
"""
html = f"<html>\n<!-- This content was generated by Wayback Tweets. Visit: https://claromes.github.io/waybacktweets -->\n"
html += f"\n<head>\n<title>@{self.username}'s archived tweets</title>\n"
html += "<style>\n"
- html += "body { font-family: monospace; background-color: #f5f8fa; color: #1c1e21; margin: 0; padding: 20px; }\n"
+ html += "body { font-family: monospace; background-color: whitesmoke; color: #1c1e21; margin: 0; padding: 20px; }\n"
html += ".container { display: flex; flex-wrap: wrap; gap: 20px; }\n"
- html += ".tweet { flex: 0 1 calc(33.33% - 20px); background-color: #ffffff; border: 1px solid #e1e8ed; border-radius: 10px; padding: 15px; overflow-wrap: break-word; margin: auto; width: 600px; }\n"
+ html += ".tweet { flex: 0 1 calc(33.33% - 20px); background-color: #ffffff; border: 1px solid #e2e2e2; border-radius: 10px; padding: 15px; overflow-wrap: break-word; margin: auto; width: 600px; }\n"
html += ".tweet strong { font-weight: bold; }\n"
html += ".tweet a { color: #000000; text-decoration: none; }\n"
html += ".content { color: #000000; }\n"
for index, tweet in enumerate(self.json_file_path):
html += '<div class="tweet">\n'
- if (
- tweet["archived_mimetype"] != "application/json"
- and not tweet["available_tweet_text"]
- ):
+ if not tweet["available_tweet_text"]:
iframe_src = {
"Archived Tweet": tweet["archived_tweet_url"],
"Parsed Archived Tweet": tweet["parsed_archived_tweet_url"],
"Parsed Tweet": tweet["parsed_tweet_url"],
}
+ html += f'<p class="source">{tweet["original_tweet_url"]}</p>\n'
+ html += f'<p class="source">{tweet["archived_mimetype"]}</p>\n'
+ html += "<br>\n"
+
for key, value in iframe_src.items():
key_cleaned = key.replace(" ", "_")
index=index, url=value, key_cleaned=key_cleaned
)
- html += "<br>\n"
- html += f'<p class="source">{tweet["original_tweet_url"]}</p>\n'
-
if tweet["available_tweet_text"]:
html += "<br>\n"
html += f'<p><strong class="content">Available Tweet Content:</strong> {tweet["available_tweet_text"]}</p>\n'
html += "<br>\n"
html += f'<p><strong>Archived URL Key:</strong> {tweet["archived_urlkey"]}</p>\n'
html += f'<p><strong>Archived Timestamp:</strong> {timestamp_parser(tweet["archived_timestamp"])} ({tweet["archived_timestamp"]})</p>\n'
- html += f'<p><strong>Archived mimetype: {tweet["archived_mimetype"]}</strong></p>\n'
+ html += f'<p><strong>Archived mimetype:</strong> {tweet["archived_mimetype"]}</p>\n'
html += f'<p><strong>Archived Statuscode:</strong> {tweet["archived_statuscode"]}</p>\n'
html += (
f'<p><strong>Archived Digest:</strong> {tweet["archived_digest"]}</p>\n'