From bab3645fdbf1e61ed2d4f2eace88ecf1a61bea92 Mon Sep 17 00:00:00 2001
From: Claromes
Date: Sat, 22 Jun 2024 07:37:35 -0300
Subject: [PATCH] review streamlit app

---
 app/app.py         | 497 ++++++++++++++++++++++-----------------------
 docs/streamlit.rst |   2 +-
 2 files changed, 244 insertions(+), 255 deletions(-)

diff --git a/app/app.py b/app/app.py
index 867d51a..d8ea1a5 100644
--- a/app/app.py
+++ b/app/app.py
@@ -1,4 +1,4 @@
-import datetime
+from datetime import datetime
 
 import streamlit as st
 import streamlit.components.v1 as components
@@ -7,21 +7,23 @@ from waybacktweets.api.export import TweetsExporter
 from waybacktweets.api.parse import JsonParser, TweetsParser
 from waybacktweets.api.request import WaybackTweets
 from waybacktweets.config.config import config
-from waybacktweets.exceptions.exceptions import (
-    ConnectionError,
-    EmptyResponseError,
-    ReadTimeoutError,
-)
-from waybacktweets.utils.utils import (
-    check_double_status,
-    get_response,
-    semicolon_parser,
-)
 
 # Initial Settings
 LOGO = "assets/parthenon.png"
 TITLE = "assets/waybacktweets.png"
+FIELD_OPTIONS = [
+    "parsed_archived_timestamp",
+    "archived_tweet_url",
+    "parsed_archived_tweet_url",
+    "original_tweet_url",
+    "parsed_tweet_url",
+    "available_tweet_text",
+    "available_tweet_is_RT",
+    "available_tweet_info",
+    "archived_mimetype",
+    "archived_statuscode",
+]
 
 st.set_page_config(
     page_title="Wayback Tweets",
@@ -35,7 +37,7 @@ st.set_page_config(
 
         The application is a prototype hosted on Streamlit Cloud, allowing users to apply filters and view tweets that lack the original URL. [Read more](https://claromes.github.io/waybacktweets/streamlit.html).
 
-        © 2023 - {datetime.datetime.now().year}, [Claromes](https://claromes.com) · Icon by The Doodle Library · Title font by Google, licensed under the Open Font License
+        © 2023 - {datetime.now().year}, [Claromes](https://claromes.com) · Icon by The Doodle Library · Title font by Google, licensed under the Open Font License
 
         ---
         """,  # noqa: E501
@@ -92,8 +94,8 @@ if "offset" not in st.session_state:
 if "count" not in st.session_state:
     st.session_state.count = False
 
-start_date = datetime.datetime(2006, 3, 1)
-end_date = datetime.datetime.now()
+start_date = datetime(2006, 1, 1)
+end_date = datetime.now()
 
 if "archived_timestamp_filter" not in st.session_state:
     st.session_state.archived_timestamp_filter = (start_date, end_date)
@@ -101,7 +103,7 @@ if "archived_timestamp_filter" not in st.session_state:
 
 # Verbose mode configuration
 
-config.verbose = False
+config.verbose = True
 
 
 # Pagination Settings
@@ -136,33 +138,78 @@ def next_page():
 
 
 @st.cache_data(ttl=1800, show_spinner=False)
-def tweets_count(username, archived_timestamp_filter):
-    url = f"https://web.archive.org/cdx/search/cdx?url=https://twitter.com/{username}/status/*&output=json&from={archived_timestamp_filter[0]}&to={archived_timestamp_filter[1]}"  # noqa: E501
+def wayback_tweets(
+    username,
+    collapse,
+    timestamp_from,
+    timestamp_to,
+    limit,
+    offset,
+    matchtype,
+):
+    response = WaybackTweets(
+        username,
+        collapse,
+        timestamp_from,
+        timestamp_to,
+        limit,
+        offset,
+        matchtype,
+    )
+    archived_tweets = response.get()
 
-    try:
-        response = get_response(url=url)
-
-        if response.status_code == 200:
-            data = response.json()
-            if data and len(data) > 1:
-                total_tweets = len(data) - 1
-                return total_tweets
-            else:
-                return 0
-    except ReadTimeoutError:
-        st.error("Connection to web.archive.org timed out.")
-        st.stop()
-    except ConnectionError:
-        st.error("Failed to establish a new connection with web.archive.org.")
-        st.stop()
-    except EmptyResponseError:
-        st.error("No data was saved due to an empty response.")
-        st.stop()
+
+    return archived_tweets
 
 
-# Interface Settings
+@st.cache_data(ttl=1800, show_spinner=False)
+def tweets_parser(archived_tweets, username, field_options):
+    # `username` is a parameter rather than a global so that it is part of
+    # the st.cache_data cache key
+    parser = TweetsParser(archived_tweets, username, field_options)
+    parsed_tweets = parser.parse()
+
+    return parsed_tweets
+
+
+@st.cache_data(ttl=1800, show_spinner=False)
+def tweets_exporter(parsed_tweets, username, field_options):
+    exporter = TweetsExporter(parsed_tweets, username, field_options)
+
+    df = exporter.dataframe
+
+    return df
+
+
+@st.cache_data(ttl=1800, show_spinner=False)
+def tweets_json_parser(mimetype, url):
+    # The per-tweet values are passed as arguments: st.cache_data keys its
+    # cache on them, so a zero-argument version would replay the first
+    # tweet's result for every row
+    if mimetype == "application/json":
+        json_parser = JsonParser(url)
+        text_json = json_parser.parse()
+
+        if text_json:
+            return text_json
+
+    return None
+
+
+# The display helpers below read the dataframe columns and the loop index
+# `i` from module scope, relying on Streamlit's top-down execution
+def display_tweet_header():
+    header = st.markdown(
+        f"[**archived url ↗**]({archived_tweet_url[i]}) · [**tweet url ↗**]({original_tweet_url[i]}) · **mimetype:** {archived_mimetype[i]} · **archived timestamp:** {parsed_archived_timestamp[i]} · **archived status code:** {archived_statuscode[i]}"  # noqa: E501
+    )
+
+    return header
+
+
+def display_tweet_iframe():
+    tweet_iframe = components.iframe(
+        archived_tweet_url[i],
+        height=500,
+        scrolling=True,
+    )
+
+    return tweet_iframe
+
+
+# Interface Settings
 
 st.logo(LOGO)
 
 st.success(
@@ -188,8 +235,8 @@ st.caption(
 
 username = st.text_input("Username", placeholder="Without @")
 
-start_date = datetime.datetime(2006, 3, 1)
-end_date = datetime.datetime.now()
+start_date = datetime(2006, 1, 1)
+end_date = datetime.now()
 
 st.session_state.archived_timestamp_filter = st.date_input(
     "Tweets saved between",
@@ -206,7 +253,7 @@ not_available = st.checkbox(
 )
 
 unique = st.checkbox(
-    "Only unique URLs",
+    "Only unique Wayback Machine URLs",
    help="Filtering by the collapse option using the `urlkey` field and the URL Match Scope `prefix`",  # noqa: E501
 )
 
@@ -221,33 +268,17 @@ if username != st.session_state.current_username:
 
 if query or st.session_state.count:
     tweets_per_page = 25
 
-    st.session_state.count = tweets_count(
-        username, st.session_state.archived_timestamp_filter
-    )
-
-    # st.caption(
-    #     "The number of tweets per page is set to 25, and this is a fixed value due to the API rate limit."  # noqa: E501
-    # )
-    # st.write(f"**{st.session_state.count} URLs have been captured**")
+    collapse = None
+    matchType = None
 
-    if st.session_state.count:
-        if tweets_per_page > st.session_state.count:
-            tweets_per_page = st.session_state.count
+    if unique:
+        collapse = "urlkey"
+        matchType = "prefix"
 
     try:
-        # Tweet Listing Processing
-
-        progress = st.empty()
-
-        collapse = None
-        matchType = None
-        if unique:
-            collapse = "urlkey"
-            matchType = "prefix"
-
-        with st.spinner("Parsing..."):
-            response = WaybackTweets(
-                username,
+        with st.spinner("Waybacking..."):
+            archived_tweets = wayback_tweets(
+                st.session_state.current_username,
                 collapse,
                 st.session_state.archived_timestamp_filter[0],
                 st.session_state.archived_timestamp_filter[1],
@@ -255,215 +286,173 @@ if query or st.session_state.count:
                 st.session_state.offset,
                 matchType,
             )
-            archived_tweets = response.get()
-
-        if archived_tweets:
-            field_options = [
-                "archived_urlkey",
-                "archived_timestamp",
-                "original_tweet_url",
-                "archived_tweet_url",
-                "parsed_tweet_url",
-                "parsed_archived_tweet_url",
-                "available_tweet_text",
-                "available_tweet_is_RT",
-                "available_tweet_info",
-                "archived_mimetype",
-                "archived_statuscode",
-            ]
-
-            parser = TweetsParser(archived_tweets, username, field_options)
-            parsed_tweets = parser.parse()
-
-            exporter = TweetsExporter(parsed_tweets, username, field_options)
-            df = exporter.dataframe
-
-            archived_urlkey = df["archived_urlkey"]
-            archived_timestamp = df["archived_timestamp"]
-            original_tweet_url = df["original_tweet_url"]
-            archived_tweet_url = df["archived_tweet_url"]
-            parsed_tweet_url = df["parsed_tweet_url"]
-            parsed_archived_tweet_url = df["parsed_archived_tweet_url"]
-            available_tweet_text = df["available_tweet_text"]
-            available_tweet_is_RT = df["available_tweet_is_RT"]
-            available_tweet_info = df["available_tweet_info"]
-            archived_mimetype = df["archived_mimetype"]
-            archived_statuscode = df["archived_statuscode"]
-
-            st.divider()
-            st.session_state.current_username = username
-
-            return_none_count = 0
-
-            start_index = st.session_state.offset
-            end_index = min(st.session_state.count, start_index + tweets_per_page)
-
-            for i in range(tweets_per_page):
-                try:
-                    if archived_mimetype[i] == "application/json":
-                        json_parser = JsonParser(parsed_archived_tweet_url[i])
-                        text_json = json_parser.parse()
-
-                        if text_json:
-                            parsed_text_json = semicolon_parser(text_json)
-
-                    # Display all tweets
-                    if not not_available:
-                        # Display available tweets
-                        if available_tweet_text[i]:
-                            st.markdown(
-                                f'[**archived url ↗**]({archived_tweet_url[i]}) · [**tweet url ↗**]({original_tweet_url[i]}) · **mimetype:** {archived_mimetype[i]} · **archived timestamp:** {datetime.datetime.strptime(str(archived_timestamp[i]), "%Y%m%d%H%M%S")} · **archived status code:** {archived_statuscode[i]}'  # noqa: E501
-                            )
-                            if available_tweet_is_RT[i]:
-                                st.info("*Retweet*")
+        parsed_tweets = tweets_parser(
+            archived_tweets, st.session_state.current_username, FIELD_OPTIONS
+        )
+        df = tweets_exporter(
+            parsed_tweets, st.session_state.current_username, FIELD_OPTIONS
+        )
 
-                            st.write(available_tweet_text[i])
-                            st.write(f"**{available_tweet_info[i]}**")
+        st.session_state.count = len(df)
 
-                            st.divider()
-                        # Display tweets not available with text/html, unk, warc/revisit mimetype or application/json mimetype without parsed JSON text  # noqa: E501
-                        elif (
-                            archived_mimetype[i] != "application/json"
-                            and not available_tweet_text[i]
-                        ):
-                            st.markdown(
-                                f'[**archived url ↗**]({archived_tweet_url[i]}) · [**tweet url ↗**]({original_tweet_url[i]}) · **mimetype:** {archived_mimetype[i]} · **archived timestamp:** {datetime.datetime.strptime(str(archived_timestamp[i]), "%Y%m%d%H%M%S")} · **archived status code:** {archived_statuscode[i]}'  # noqa: E501
-                            )
+        # st.caption(
+        #     "The number of tweets per page is set to 25, and this is a fixed value due to the API rate limit."  # noqa: E501
+        # )
+        # st.write(f"**{st.session_state.count} URLs have been captured**")
+
+        if st.session_state.count:
+            if tweets_per_page > st.session_state.count:
+                tweets_per_page = st.session_state.count
+
+            # Tweet Listing Processing
+
+            progress = st.empty()
+
+            parsed_archived_timestamp = df["parsed_archived_timestamp"]
+            archived_tweet_url = df["archived_tweet_url"]
+            parsed_archived_tweet_url = df["parsed_archived_tweet_url"]
+            original_tweet_url = df["original_tweet_url"]
+            parsed_tweet_url = df["parsed_tweet_url"]
+            available_tweet_text = df["available_tweet_text"]
+            available_tweet_is_RT = df["available_tweet_is_RT"]
+            available_tweet_info = df["available_tweet_info"]
+            archived_mimetype = df["archived_mimetype"]
+            archived_statuscode = df["archived_statuscode"]
+
+            st.divider()
+            st.session_state.current_username = username
+
+            return_none_count = 0
+
+            start_index = st.session_state.offset
+            end_index = min(st.session_state.count, start_index + tweets_per_page)
+
+            for i in range(tweets_per_page):
+                try:
+                    parsed_text_json = tweets_json_parser(
+                        archived_mimetype[i], parsed_archived_tweet_url[i]
+                    )
+
+                    # Display all tweets
+                    if not not_available:
+                        # Display available tweets
+                        if available_tweet_text[i]:
+                            display_tweet_header()
+
+                            if available_tweet_is_RT[i]:
+                                st.info("*Retweet*")
+
+                            st.write(available_tweet_text[i])
+                            st.write(f"**{available_tweet_info[i]}**")
+
+                            st.divider()
+                        # Display tweets not available with text/html, unk, warc/revisit mimetype or application/json mimetype without parsed JSON text  # noqa: E501
+                        elif (
+                            archived_mimetype[i] != "application/json"
+                            and not available_tweet_text[i]
+                        ):
+                            display_tweet_header()
+                            if (
+                                ".jpg" in parsed_tweet_url[i]
+                                or ".png" in parsed_tweet_url[i]
+                            ) and (400 <= archived_statuscode[i] <= 511):
+                                display_tweet_iframe()
+                            elif "/status/" not in parsed_tweet_url[i]:
+                                st.info(
+                                    "This isn't a status or is not available"  # noqa: E501
+                                )
+                            elif (
+                                f"{st.session_state.current_username}"
+                                not in parsed_tweet_url[i]
+                            ):
+                                st.info(
+                                    f"Replying to {st.session_state.current_username}"  # noqa: E501
+                                )
+                            else:
+                                display_tweet_iframe()
+
+                            st.divider()
+                        # Display tweets not available with application/json mimetype and parsed JSON text  # noqa: E501
+                        elif (
+                            archived_mimetype[i] == "application/json"
+                            and not available_tweet_text[i]
+                        ):
+                            display_tweet_header()
+                            st.code(parsed_text_json)
+
+                            st.divider()
-                            if (
-                                ".jpg" in original_tweet_url[i]
-                                or ".png" in original_tweet_url[i]
-                            ) and (400 <= archived_statuscode[i] <= 511):
-                                components.iframe(
-                                    archived_tweet_url[i],
-                                    height=500,
-                                    scrolling=True,
-                                )
-                            elif "/status/" not in original_tweet_url[i]:
-                                st.info(
-                                    "This isn't a status or is not available"  # noqa: E501
-                                )
-                            elif (
-                                check_double_status(
-                                    archived_tweet_url[i],
-                                    original_tweet_url[i],
-                                )
-                                or f"{st.session_state.current_username}"
-                                not in original_tweet_url[i]
-                            ):
-                                st.info(
-                                    f"Replying to {st.session_state.current_username}"  # noqa: E501
-                                )
-                            else:
-                                components.iframe(
-                                    archived_tweet_url[i],
-                                    height=500,
-                                    scrolling=True,
-                                )
-
-                            st.divider()
-                        # Display tweets not available with application/json mimetype and parsed JSON text  # noqa: E501
-                        elif (
-                            archived_mimetype[i] == "application/json"
-                            and not available_tweet_text[i]
-                        ):
-                            st.markdown(
-                                f'[**archived url ↗**]({archived_tweet_url[i]}) · [**tweet url ↗**]({original_tweet_url[i]}) · **mimetype:** {archived_mimetype[i]} · **archived timestamp:** {datetime.datetime.strptime(str(archived_timestamp[i]), "%Y%m%d%H%M%S")} · **archived status code:** {archived_statuscode[i]}'  # noqa: E501
-                            )
-                            st.code(parsed_text_json)
-
-                            st.divider()
-
-                    # Display only tweets not available
-                    if not_available:
-                        # Display tweets not available with text/html, unk, warc/revisit return  # noqa: E501
-                        if (
-                            archived_mimetype[i] != "application/json"
-                            and not available_tweet_text[i]
-                        ):
-                            return_none_count += 1
-
-                            st.markdown(
-                                f'[**archived url ↗**]({archived_tweet_url[i]}) · [**tweet url ↗**]({original_tweet_url[i]}) · **mimetype:** {archived_mimetype[i]} · **archived timestamp:** {datetime.datetime.strptime(str(archived_timestamp[i]), "%Y%m%d%H%M%S")} · **archived status code:** {archived_statuscode[i]}'  # noqa: E501
-                            )
-                            if (
-                                ".jpg" in original_tweet_url[i]
-                                or ".png" in original_tweet_url[i]
-                            ) and (400 <= archived_statuscode[i] <= 511):
-                                components.iframe(
-                                    archived_tweet_url[i],
-                                    height=500,
-                                    scrolling=True,
-                                )
-                            elif "/status/" not in original_tweet_url[i]:
-                                st.info(
-                                    "This isn't a status or is not available"  # noqa: E501
-                                )
-                            elif (
-                                check_double_status(
-                                    archived_tweet_url[i],
-                                    original_tweet_url[i],
-                                )
-                                or f"{st.session_state.current_username}"
-                                not in original_tweet_url[i]
-                            ):
-                                st.info(
-                                    f"Replying to {st.session_state.current_username}"  # noqa: E501
-                                )
-                            else:
-                                components.iframe(
-                                    archived_tweet_url[i],
-                                    height=500,
-                                    scrolling=True,
-                                )
-
-                            st.divider()
-
-                        # Display tweets not available with application/json return  # noqa: E501
-                        elif (
-                            archived_mimetype[i] == "application/json"
-                            and not available_tweet_text[i]
-                        ):
-                            return_none_count += 1
-
-                            st.markdown(
-                                f'[**archived url ↗**]({archived_tweet_url[i]}) · [**tweet url ↗**]({original_tweet_url[i]}) · **mimetype:** {archived_mimetype[i]} · **archived timestamp:** {datetime.datetime.strptime(str(archived_timestamp[i]), "%Y%m%d%H%M%S")} · **archived status code:** {archived_statuscode[i]}'  # noqa: E501
-                            )
-                            st.code(parsed_text_json)
-
-                            st.divider()
+                    # Display only tweets not available
+                    if not_available:
+                        # Display tweets not available with text/html, unk, warc/revisit return  # noqa: E501
+                        if (
+                            archived_mimetype[i] != "application/json"
+                            and not available_tweet_text[i]
+                        ):
+                            return_none_count += 1
+
+                            display_tweet_header()
+                            if (
+                                ".jpg" in parsed_tweet_url[i]
+                                or ".png" in parsed_tweet_url[i]
+                            ) and (400 <= archived_statuscode[i] <= 511):
+                                display_tweet_iframe()
+                            elif "/status/" not in parsed_tweet_url[i]:
+                                st.info(
+                                    "This isn't a status or is not available"  # noqa: E501
+                                )
+                            elif (
+                                f"{st.session_state.current_username}"
+                                not in parsed_tweet_url[i]
+                            ):
+                                st.info(
+                                    f"Replying to {st.session_state.current_username}"  # noqa: E501
+                                )
+                            else:
+                                display_tweet_iframe()
+
+                            st.divider()
+
+                        # Display tweets not available with application/json return  # noqa: E501
+                        elif (
+                            archived_mimetype[i] == "application/json"
+                            and not available_tweet_text[i]
+                        ):
+                            return_none_count += 1
+
+                            display_tweet_header()
+                            st.code(parsed_text_json)
+
+                            st.divider()
 
                     progress.write(
                         f"{return_none_count} URLs have been captured in the range {start_index}-{end_index}"  # noqa: E501
                     )
                 except IndexError:
                     if start_index <= 0:
                         st.session_state.prev_disabled = True
                     else:
                         st.session_state.prev_disabled = False
 
                     st.session_state.next_disabled = True
 
         prev, _, next = st.columns([3, 4, 3])
 
         prev.button(
             "Previous",
             disabled=st.session_state.prev_disabled,
             key="prev_button_key",
             on_click=prev_page,
             type="primary",
             use_container_width=True,
         )
         next.button(
             "Next",
             disabled=st.session_state.next_disabled,
             key="next_button_key",
             on_click=next_page,
             type="primary",
             use_container_width=True,
         )
 
         if not archived_tweets:
             st.error(
                 "Failed to establish a new connection with web.archive.org. Max retries exceeded. Please wait a few minutes and try again."  # noqa: E501
             )
diff --git a/docs/streamlit.rst b/docs/streamlit.rst
index f1880b7..9c91b05 100644
--- a/docs/streamlit.rst
+++ b/docs/streamlit.rst
@@ -15,7 +15,7 @@ Filters
 
 - Only unavailable tweets: Checks if the archived URL still exists on Twitter (see the :ref:`flowchart`)
 
-- Only unique URLs: Filtering by the collapse option using the ``urlkey`` field and the URL Match Scope ``prefix``
+- Only unique Wayback Machine URLs: Filtering by the collapse option using the ``urlkey`` field and the URL Match Scope ``prefix``
 
 Pagination
-- 
2.34.1
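
Reviewer note on the "Only unique Wayback Machine URLs" filter: the checkbox maps onto
two parameters of the Wayback Machine CDX Server API, collapse=urlkey (keep a single
capture per unique URL key) and matchType=prefix (match every URL under the user's
status path). A minimal sketch of the equivalent raw query, independent of the
waybacktweets wrappers -- the helper name and the use of the requests library are
illustrative assumptions, not part of this patch:

    import requests

    def unique_status_captures(username, timestamp_from, timestamp_to):
        # One capture per unique URL key under https://twitter.com/<username>/status
        params = {
            "url": f"https://twitter.com/{username}/status",
            "matchType": "prefix",   # match every URL under the status path
            "collapse": "urlkey",    # keep only the first capture per unique URL key
            "output": "json",
            "from": timestamp_from,  # e.g. "20060101"
            "to": timestamp_to,      # e.g. "20240622"
        }
        response = requests.get(
            "https://web.archive.org/cdx/search/cdx", params=params, timeout=30
        )
        response.raise_for_status()
        rows = response.json()
        return rows[1:]  # the first row of the CDX JSON output is the field-name header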
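
Reviewer note on the cached helpers: st.cache_data builds its cache key from the
decorated function's arguments, which is why tweets_json_parser receives the per-tweet
mimetype and URL explicitly; a zero-argument cached function is computed once and its
first result is replayed on every later call within the TTL, regardless of any globals
it reads. A minimal sketch of the difference, with hypothetical function names for
illustration only:

    import streamlit as st

    @st.cache_data(ttl=1800)
    def parse_capture(url):
        # `url` is part of the cache key: each distinct capture is computed
        # once, and repeated calls with the same url are cache hits.
        return url.rsplit("/", 1)[-1]

    @st.cache_data(ttl=1800)
    def parse_current_capture():
        # No arguments, so the cache key never varies: every call within the
        # TTL replays the first run's result, even if module-level state has
        # changed in the meantime.
        return "same result every time"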