Update HTML output, docs app and spinner, and print messages; delete preview image
author Claromes <claromes@hey.com>
Mon, 24 Jun 2024 11:40:21 +0000 (08:40 -0300)
committer Claromes <claromes@hey.com>
Mon, 24 Jun 2024 11:40:21 +0000 (08:40 -0300)
app/app.py
assets/preview_image.jpg [deleted file]
waybacktweets/_cli.py
waybacktweets/api/export.py
waybacktweets/api/parse.py
waybacktweets/api/visualize.py
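
Note on the spinner changes below: the cached helpers in app/app.py now pass `show_spinner=False` to `st.cache_data`, and each call site wraps the work in an explicit `st.spinner` context so the message names the current stage. A minimal sketch of the pattern, with `fetch_archived_tweets` as a hypothetical stand-in for the cached helpers:

```python
# Sketch of the caching + spinner pattern adopted in app/app.py.
# fetch_archived_tweets is a hypothetical stand-in, not part of the package.
import streamlit as st


@st.cache_data(ttl=600, show_spinner=False)  # suppress the default cache spinner
def fetch_archived_tweets(username: str) -> list:
    # the expensive Wayback Machine CDX request would go here
    return []


username = "example"
with st.spinner(f"Waybacking @{username}'s archived tweets"):
    archived_tweets = fetch_archived_tweets(username)
```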

diff --git a/app/app.py b/app/app.py
index 8a209d51257010d78989ad13b7c904652f456771..19804497c2fe108f23f944b42e5680e566b86a84 100644 (file)
@@ -1,5 +1,5 @@
 import base64
-from datetime import datetime
+from datetime import datetime, timedelta
 
 import streamlit as st
 
@@ -13,13 +13,13 @@ from waybacktweets.config import FIELD_OPTIONS, config
 
 PAGE_ICON = "assets/parthenon.png"
 TITLE = "assets/waybacktweets.png"
-PREVIEW_IMAGE = "assets/preview_image.jpg"
 DOWNLOAD = "assets/download.svg"
 
 collapse = None
 matchtype = None
-start_date = datetime(2006, 1, 1)
+start_date = datetime.now() - timedelta(days=365 * 2)
 end_date = datetime.now()
+min_date = datetime(2006, 1, 1)
 
 # ------ Verbose Mode Configuration ------ #
 
@@ -81,7 +81,7 @@ st.html(
 # ------ Requests ------ #
 
 
-@st.cache_data(ttl=600, show_spinner=True)
+@st.cache_data(ttl=600, show_spinner=False)
 def wayback_tweets(
     username,
     collapse,
@@ -105,7 +105,7 @@ def wayback_tweets(
     return archived_tweets
 
 
-@st.cache_data(ttl=600, show_spinner=True)
+@st.cache_data(ttl=600, show_spinner=False)
 def tweets_parser(archived_tweets, username, field_options):
     parser = TweetsParser(archived_tweets, username, field_options)
     parsed_tweets = parser.parse()
@@ -113,7 +113,7 @@ def tweets_parser(archived_tweets, username, field_options):
     return parsed_tweets
 
 
-@st.cache_data(ttl=600, show_spinner=True)
+@st.cache_data(ttl=600, show_spinner=False)
 def tweets_exporter(parsed_tweets, username, field_options):
     exporter = TweetsExporter(parsed_tweets, username, field_options)
 
@@ -135,11 +135,11 @@ st.caption(
 )
 st.write("Retrieve archived tweets CDX data in CSV, JSON, and HTML formats.")
 
-st.caption(
+st.write(
     "This application uses the Wayback Tweets Python package, which can be used as a module or as a standalone command line tool. [Read the documentation](https://claromes.github.io/waybacktweets)."  # noqa: E501
 )
 
-st.caption(
+st.write(
     "To access the legacy version of Wayback Tweets [click here](https://waybacktweets-legacy.streamlit.app)."  # noqa: E501
 )
 
@@ -150,13 +150,14 @@ st.divider()
 username = st.text_input("Username *", key="username", placeholder="Without @")
 
 with st.expander("Filtering"):
-    start_date = datetime(2006, 1, 1)
-    end_date = datetime.now()
 
+    st.caption(
+        ":orange[A large date range takes a long time to process, and the app's resources may not be sufficient. Try to perform searches with smaller ranges to get faster results.]"  # noqa: E501
+    )
     st.session_state.archived_timestamp_filter = st.date_input(
         "Tweets saved between",
         (start_date, end_date),
-        start_date,
+        min_date,
         end_date,
         format="YYYY/MM/DD",
         help="Using the `from` and `to` filters. Format: YYYY/MM/DD",
@@ -178,21 +179,11 @@ with st.expander("Filtering"):
             help="Allows for a simple way to scroll through the results",
         )
 
-    col3, col4 = st.columns(2)
-
-    with col3:
-        not_available = st.checkbox(
-            "Only tweets not available",
-            key="not_available",
-            help="Checks if the archived URL still exists on Twitter",
-        )
-
-    with col4:
-        unique = st.checkbox(
-            "Only unique Wayback Machine URLs",
-            key="unique",
-            help="Filtering by the collapse option using the `urlkey` field and the URL Match Scope `prefix`",  # noqa: E501
-        )
+    unique = st.checkbox(
+        "Only unique Wayback Machine URLs",
+        key="unique",
+        help="Filtering by the collapse option using the `urlkey` field and the URL Match Scope `prefix`",  # noqa: E501
+    )
 
 
 query = st.button("Query", type="primary", use_container_width=True)
@@ -208,102 +199,111 @@ if query or st.session_state.count:
         matchtype = "prefix"
 
     try:
-        wayback_tweets = wayback_tweets(
-            st.session_state.current_username,
-            collapse,
-            st.session_state.archived_timestamp_filter[0],
-            st.session_state.archived_timestamp_filter[1],
-            limit,
-            offset,
-            matchtype,
-        )
+        with st.spinner(
+            f"Waybacking @{st.session_state.current_username}'s archived tweets"
+        ):
+            wayback_tweets = wayback_tweets(
+                st.session_state.current_username,
+                collapse,
+                st.session_state.archived_timestamp_filter[0],
+                st.session_state.archived_timestamp_filter[1],
+                limit,
+                offset,
+                matchtype,
+            )
 
         if not wayback_tweets:
             st.error("No data was saved due to an empty response.")
             st.stop()
 
-        parsed_tweets = tweets_parser(
-            wayback_tweets, st.session_state.current_username, FIELD_OPTIONS
-        )
+        with st.spinner(
+            f"Parsing @{st.session_state.current_username}'s archived tweets"
+        ):
+            parsed_tweets = tweets_parser(
+                wayback_tweets, st.session_state.current_username, FIELD_OPTIONS
+            )
 
-        df, file_name = tweets_exporter(
-            parsed_tweets, st.session_state.current_username, FIELD_OPTIONS
-        )
+            df, file_name = tweets_exporter(
+                parsed_tweets, st.session_state.current_username, FIELD_OPTIONS
+            )
 
         csv_data = df.to_csv(index=False)
         json_data = df.to_json(orient="records", lines=False)
         html = HTMLTweetsVisualizer(username, json_data)
         html_content = html.generate()
 
-        st.session_state.count = len(df)
-        st.write(f"**{st.session_state.count} URLs have been captured**")
+        # -- Rendering -- #
 
-        # -- HTML -- #
+        if csv_data and json_data and html_content:
+            st.session_state.count = len(df)
+            st.write(f"**{st.session_state.count} URLs have been captured**")
 
-        st.header("HTML", divider="gray")
-        st.write(
-            f"Visualize tweets more efficiently through iframes. Download the @{st.session_state.current_username}'s archived tweets in HTML."  # noqa: E501
-        )
+            # -- HTML -- #
 
-        col5, col6 = st.columns([1, 18])
+            st.header("HTML", divider="gray")
+            st.write(
+                f"Visualize tweets more efficiently through iframes. Download the @{st.session_state.current_username}'s archived tweets in HTML."  # noqa: E501
+            )
 
-        with col5:
-            st.image(DOWNLOAD, width=22)
+            col5, col6 = st.columns([1, 18])
 
-        with col6:
-            b64_html = base64.b64encode(html_content.encode()).decode()
-            href_html = f"data:text/html;base64,{b64_html}"
+            with col5:
+                st.image(DOWNLOAD, width=22)
 
-            st.markdown(
-                f'<a href="{href_html}" download="{file_name}.html" title="Download {file_name}.html">{file_name}.html</a>',  # noqa: E501
-                unsafe_allow_html=True,
-            )
+            with col6:
+                b64_html = base64.b64encode(html_content.encode()).decode()
+                href_html = f"data:text/html;base64,{b64_html}"
 
-        st.image(PREVIEW_IMAGE, "Preview image")
+                st.markdown(
+                    f'<a href="{href_html}" download="{file_name}.html" title="Download {file_name}.html">{file_name}.html</a>',  # noqa: E501
+                    unsafe_allow_html=True,
+                )
 
-        # -- CSV -- #
+            # -- CSV -- #
 
-        st.header("CSV", divider="gray")
-        st.write(
-            "Check the data returned in the dataframe below and download the file."
-        )
+            st.header("CSV", divider="gray")
+            st.write(
+                "Check the data returned in the dataframe below and download the file."
+            )
 
-        col7, col8 = st.columns([1, 18])
+            col7, col8 = st.columns([1, 18])
 
-        with col7:
-            st.image(DOWNLOAD, width=22)
+            with col7:
+                st.image(DOWNLOAD, width=22)
 
-        with col8:
-            b64_csv = base64.b64encode(csv_data.encode()).decode()
-            href_csv = f"data:file/csv;base64,{b64_csv}"
+            with col8:
+                b64_csv = base64.b64encode(csv_data.encode()).decode()
+                href_csv = f"data:file/csv;base64,{b64_csv}"
 
-            st.markdown(
-                f'<a href="{href_csv}" download="{file_name}.csv" title="Download {file_name}.csv">{file_name}.csv</a>',  # noqa: E501
-                unsafe_allow_html=True,
-            )
+                st.markdown(
+                    f'<a href="{href_csv}" download="{file_name}.csv" title="Download {file_name}.csv">{file_name}.csv</a>',  # noqa: E501
+                    unsafe_allow_html=True,
+                )
 
-        st.dataframe(df, use_container_width=True)
+            st.dataframe(df, use_container_width=True)
 
-        # -- JSON -- #
+            # -- JSON -- #
 
-        st.header("JSON", divider="gray")
-        st.write("Check the data returned in JSON format below and download the file.")
+            st.header("JSON", divider="gray")
+            st.write(
+                "Check the data returned in JSON format below and download the file."
+            )
 
-        col9, col10 = st.columns([1, 18])
+            col9, col10 = st.columns([1, 18])
 
-        with col9:
-            st.image(DOWNLOAD, width=22)
+            with col9:
+                st.image(DOWNLOAD, width=22)
 
-        with col10:
-            b64_json = base64.b64encode(json_data.encode()).decode()
-            href_json = f"data:file/json;base64,{b64_json}"
+            with col10:
+                b64_json = base64.b64encode(json_data.encode()).decode()
+                href_json = f"data:file/json;base64,{b64_json}"
 
-            st.markdown(
-                f'<a href="{href_json}" download="{file_name}.json" title="Download {file_name}.json">{file_name}.json</a>',  # noqa: E501
-                unsafe_allow_html=True,
-            )
+                st.markdown(
+                    f'<a href="{href_json}" download="{file_name}.json" title="Download {file_name}.json">{file_name}.json</a>',  # noqa: E501
+                    unsafe_allow_html=True,
+                )
 
-        st.json(json_data, expanded=False)
+            st.json(json_data, expanded=False)
     except TypeError as e:
         st.error(
             f"""
diff --git a/assets/preview_image.jpg b/assets/preview_image.jpg
deleted file mode 100644 (file)
index cf4633d..0000000
Binary files a/assets/preview_image.jpg and /dev/null differ
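
The download links in app/app.py embed the exported content directly as base64 `data:` URLs rather than writing temporary files. A standalone sketch of that pattern (`html_content` and `file_name` are placeholder values):

```python
# Sketch of the data-URL download link rendered in app/app.py.
# html_content and file_name are placeholders.
import base64

import streamlit as st

html_content = "<html><body>example</body></html>"
file_name = "example_user_tweets"

b64_html = base64.b64encode(html_content.encode()).decode()
href_html = f"data:text/html;base64,{b64_html}"

st.markdown(
    f'<a href="{href_html}" download="{file_name}.html" title="Download {file_name}.html">{file_name}.html</a>',
    unsafe_allow_html=True,
)
```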
diff --git a/waybacktweets/_cli.py b/waybacktweets/_cli.py
index d115c09bd30ac8faef0d28d44b54efb9e1208d24..4048fc780def996c25f2ef3b5c861bbb944ae339 100644 (file)
@@ -121,7 +121,7 @@ def main(
             username, collapse, timestamp_from, timestamp_to, limit, offset, matchtype
         )
 
-        print("Making a request to the Internet Archive...")
+        print(f"Waybacking @{username}'s archived tweets...")
         archived_tweets = api.get()
 
         if archived_tweets:
diff --git a/waybacktweets/api/export.py b/waybacktweets/api/export.py
index af8e804798dbdef442ce0d754fdc4c7e176550a2..6524bfee34984eec6c48b104f81162f2e616d4dd 100644 (file)
@@ -97,23 +97,23 @@ class TweetsExporter:
         """
         Saves the DataFrame to a JSON file.
         """
-        json_file_path = f"{self.filename}.json"
-        self.dataframe.to_json(json_file_path, orient="records", lines=False)
+        json_path = f"{self.filename}.json"
+        self.dataframe.to_json(json_path, orient="records", lines=False)
 
-        print(f"Saved to {json_file_path}")
+        print(f"Saved to {json_path}")
 
     def save_to_html(self) -> None:
         """
         Saves the DataFrame to an HTML file.
         """
-        json_file_path = f"{self.filename}.json"
+        json_path = f"{self.filename}.json"
 
-        if not os.path.exists(json_file_path):
+        if not os.path.exists(json_path):
             self.save_to_json()
 
         html_file_path = f"{self.filename}.html"
 
-        html = HTMLTweetsVisualizer(self.username, json_file_path, html_file_path)
+        html = HTMLTweetsVisualizer(self.username, json_path, html_file_path)
 
         html_content = html.generate()
         html.save(html_content)
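
A hedged usage sketch for the exporter after the `json_path` rename; `parsed_tweets` is placeholder data standing in for the output of `TweetsParser.parse()`:

```python
# Sketch only: parsed_tweets stands in for TweetsParser.parse() output,
# and the placeholder shape is an assumption.
from waybacktweets.api.export import TweetsExporter
from waybacktweets.config import FIELD_OPTIONS

parsed_tweets = {field: [] for field in FIELD_OPTIONS}  # placeholder shape

exporter = TweetsExporter(parsed_tweets, "example_user", FIELD_OPTIONS)

# save_to_html() writes <filename>.json first if it is missing,
# then renders it through HTMLTweetsVisualizer.
exporter.save_to_html()
```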
diff --git a/waybacktweets/api/parse.py b/waybacktweets/api/parse.py
index 65ad04177c757c15f6daa3acea65a8f38c9d30e7..31e0e3430b10b62e137ebf62ad9fe937f3c2a482 100644 (file)
@@ -279,7 +279,8 @@ class TweetsParser:
                 task = None
                 if print_progress:
                     task = progress.add_task(
-                        f"Waybacking @{self.username} tweets\n", total=len(futures)
+                        f"Parsing @{self.username}'s archived tweets\n",
+                        total=len(futures),
                     )
 
                 for future in as_completed(futures):
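
The renamed progress message sits inside the `rich` + `concurrent.futures` pattern that TweetsParser uses; a minimal self-contained sketch (`parse_one` and `items` are hypothetical stand-ins for the real parsing work):

```python
# Sketch of the progress-reporting pattern used in TweetsParser.
# parse_one and items are hypothetical stand-ins.
from concurrent.futures import ThreadPoolExecutor, as_completed

from rich.progress import Progress


def parse_one(item):
    return item  # placeholder for per-tweet parsing


username = "example"
items = list(range(10))

with Progress() as progress:
    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(parse_one, item) for item in items]
        task = progress.add_task(
            f"Parsing @{username}'s archived tweets\n", total=len(futures)
        )
        for future in as_completed(futures):
            future.result()
            progress.advance(task)
```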
diff --git a/waybacktweets/api/visualize.py b/waybacktweets/api/visualize.py
index 3f5bfcce2b0e3e105fa48e0722a9e0aa76a3b65a..e1e9e8e14d2a0edf56fc921eaef9b18104e745b2 100644 (file)
@@ -16,36 +16,36 @@ class HTMLTweetsVisualizer:
 
     Args:
         username (str): The username associated with the tweets.
-        json_file_path (Union[str, List[str]]): The path of the JSON file or the JSON data itself.
+        json_path (Union[str, List[str]]): The path of the JSON file or the JSON data itself.
         html_file_path (str, optional): The path where the HTML file will be saved.
     """
 
     def __init__(
         self,
         username: str,
-        json_file_path: Union[str, List[str]],
+        json_path: Union[str, List[str]],
         html_file_path: str = None,
     ):
         self.username = username
-        self.json_file_path = self._json_loader(json_file_path)
+        self.json_path = self._json_loader(json_path)
         self.html_file_path = html_file_path
 
     @staticmethod
-    def _json_loader(json_file_path: Union[str, List[str]]) -> List[Dict[str, Any]]:
+    def _json_loader(json_path: Union[str, List[str]]) -> List[Dict[str, Any]]:
         """
         Reads and loads JSON data from a specified file path or JSON string.
 
         Args:
-            json_file_path (Union[str, List[str]]): The path of the JSON file or the JSON data itself.
+            json_path (Union[str, List[str]]): The path of the JSON file or the JSON data itself.
 
         Returns:
             The content of the JSON file or data.
         """
-        if os.path.isfile(json_file_path):
-            with open(json_file_path, "r", encoding="utf-8") as f:
+        if os.path.isfile(json_path):
+            with open(json_path, "r", encoding="utf-8") as f:
                 return json.load(f)
 
-        return json.loads(json_file_path)
+        return json.loads(json_path)
 
     def generate(self) -> str:
         """
@@ -104,7 +104,7 @@ class HTMLTweetsVisualizer:
         html += f"<h1>@{self.username}'s archived tweets</h1>\n"
         html += '<div class="container">\n'
 
-        for index, tweet in enumerate(self.json_file_path):
+        for index, tweet in enumerate(self.json_path):
             html += '<div class="tweet">\n'
 
             if not tweet["available_tweet_text"]:
@@ -115,10 +115,6 @@ class HTMLTweetsVisualizer:
                     "Parsed Tweet": tweet["parsed_tweet_url"],
                 }
 
-                html += f'<p class="source">{tweet["original_tweet_url"]}</p>\n'
-                html += f'<p class="source">{tweet["archived_mimetype"]}</p>\n'
-                html += "<br>\n"
-
                 for key, value in iframe_src.items():
                     key_cleaned = key.replace(" ", "_")
 
@@ -155,6 +151,12 @@ class HTMLTweetsVisualizer:
                 html += f'<p><strong class="content">Available Tweet Username:</strong> {tweet["available_tweet_info"]}</p>\n'
 
             html += "<br>\n"
+            html += f'<p><strong>Archived Tweet:</strong> {tweet["archived_tweet_url"]}</p>\n'
+            html += f'<p><strong>Parsed Archived Tweet:</strong> {tweet["parsed_archived_tweet_url"]}</p>\n'
+            html += f'<p><strong>Original Tweet:</strong> {tweet["original_tweet_url"]}</p>\n'
+            html += (
+                f'<p><strong>Parsed Tweet:</strong> {tweet["parsed_tweet_url"]}</p>\n'
+            )
             html += f'<p><strong>Archived URL Key:</strong> {tweet["archived_urlkey"]}</p>\n'
             html += f'<p><strong>Archived Timestamp:</strong> {timestamp_parser(tweet["archived_timestamp"])} ({tweet["archived_timestamp"]})</p>\n'
             html += f'<p><strong>Archived mimetype:</strong> {tweet["archived_mimetype"]}</p>\n'
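
Because `_json_loader` checks `os.path.isfile` before falling back to `json.loads`, `HTMLTweetsVisualizer` accepts either a JSON file path or an in-memory JSON string. A hedged sketch of both modes (file names are examples; real records need the full set of parsed fields used by `generate()`):

```python
# Sketch of the two input modes _json_loader supports; names are examples.
from waybacktweets.api.visualize import HTMLTweetsVisualizer

# From a JSON string, as app/app.py does ("[]" keeps generate() trivial):
viz = HTMLTweetsVisualizer("example_user", "[]")
html_content = viz.generate()

# From a JSON file on disk, with an output path, as save_to_html() does:
viz = HTMLTweetsVisualizer(
    "example_user", "example_user_tweets.json", "example_user_tweets.html"
)
html_content = viz.generate()
viz.save(html_content)  # writes to html_file_path
```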