secondaryBackgroundColor = "gainsboro"
textColor = "black"
backgroundColor = "whitesmoke"
-font = "sans serif"
+font = "serif"
[client]
toolbarMode = "minimal"
- Tristan Lee (Bellingcat's Data Scientist) for the idea behind the application.
- Jessica Smith (Snowflake's Marketing Specialist) and the Streamlit/Snowflake teams for the additional server resources on Streamlit Cloud.
- The OSINT community for recommending the application.
+
+> [!NOTE]
+> If the application is down, please check the [Streamlit Cloud Status](https://www.streamlitstatus.com/).
+import base64
from datetime import datetime
import streamlit as st
-import streamlit.components.v1 as components
from waybacktweets.api.export import TweetsExporter
-from waybacktweets.api.parse import JsonParser, TweetsParser
+from waybacktweets.api.parse import TweetsParser
from waybacktweets.api.request import WaybackTweets
-from waybacktweets.config.config import config
+from waybacktweets.api.visualize import HTMLTweetsVisualizer
+from waybacktweets.config import FIELD_OPTIONS, config
-# Initial Settings
+# ------ Initial Settings ------ #
-LOGO = "assets/parthenon.png"
+PAGE_ICON = "assets/parthenon.png"
TITLE = "assets/waybacktweets.png"
-FIELD_OPTIONS = [
- "parsed_archived_timestamp",
- "archived_tweet_url",
- "parsed_archived_tweet_url",
- "original_tweet_url",
- "parsed_tweet_url",
- "available_tweet_text",
- "available_tweet_is_RT",
- "available_tweet_info",
- "archived_mimetype",
- "archived_statuscode",
-]
+PREVIEW_IMAGE = "assets/preview_image.jpg"
+DOWNLOAD = "assets/download.svg"
+
+collapse = None
+matchtype = None
+start_date = datetime(2006, 1, 1)
+end_date = datetime.now()
+
+# ------ Verbose Mode Configuration ------ #
+
+config.verbose = True
+
+# ------ Page Configuration ------ #
st.set_page_config(
page_title="Wayback Tweets",
- page_icon=LOGO,
+ page_icon=PAGE_ICON,
layout="centered",
menu_items={
"About": f"""
[](https://github.com/claromes/waybacktweets/releases) [](https://github.com/claromes/waybacktweets/blob/main/LICENSE.md) [](https://github.com/claromes/waybacktweets)
- Application that displays multiple archived tweets on Wayback Machine to avoid opening each link manually.
+ The application is a prototype hosted on Streamlit Cloud, allowing users to apply filters and download the data in different formats.
- The application is a prototype hosted on Streamlit Cloud, allowing users to apply filters and view tweets that lack the original URL. [Read more](https://claromes.github.io/waybacktweets/streamlit.html).
-
- © 2023 - {datetime.now().year}, [Claromes](https://claromes.com) · Icon by The Doodle Library · Title font by Google, licensed under the Open Font License
+ © 2023 - {end_date.year}, [Claromes](https://claromes.com)
---
""", # noqa: E501
},
)
-# https://discuss.streamlit.io/t/remove-hide-running-man-animation-on-top-of-page/21773/3
-st.html(
- """
-<style>
- header[data-testid="stHeader"] {
- opacity: 0.5;
- }
- iframe {
- border: 1px solid #dddddd;
- border-radius: 0.5rem;
- }
- div[data-testid="InputInstructions"] {
- visibility: hidden;
- }
- img[data-testid="stLogo"] {
- scale: 4;
- padding-left: 10px;
- }
- button[data-testid="StyledFullScreenButton"] {
- display: none;
- }
-</style>
-"""
-)
+# ------ Set States ------ #
if "current_username" not in st.session_state:
st.session_state.current_username = ""
-if "prev_disabled" not in st.session_state:
- st.session_state.prev_disabled = False
-
-if "next_disabled" not in st.session_state:
- st.session_state.next_disabled = False
-
-if "next_button" not in st.session_state:
- st.session_state.next_button = False
-
-if "prev_button" not in st.session_state:
- st.session_state.prev_button = False
-
-if "update_component" not in st.session_state:
- st.session_state.update_component = 0
-
-if "offset" not in st.session_state:
- st.session_state.offset = 0
-
if "count" not in st.session_state:
st.session_state.count = False
-start_date = datetime(2006, 1, 1)
-end_date = datetime.now()
-
if "archived_timestamp_filter" not in st.session_state:
st.session_state.archived_timestamp_filter = (start_date, end_date)
+# ------ Add Custom CSS Style ------ #
-# Verbose mode configuration
-
-config.verbose = True
-
-
-# Pagination Settings
-
-
-def scroll_into_view():
- script = f"""
- <script>
- window.parent.document.querySelector('section.main').scrollTo(0, 0);
- let update_component = {st.session_state.update_component}
- </script>
+st.html(
"""
+ <style>
+ header[data-testid="stHeader"] {
+ opacity: 0.5;
+ }
+ iframe {
+ border: 1px solid #dddddd;
+ border-radius: 0.5rem;
+ }
+ div[data-testid="InputInstructions"] {
+ visibility: hidden;
+ }
+ button[data-testid="StyledFullScreenButton"] {
+ display: none;
+ }
+ </style>
+ """
+)
- components.html(script, width=0, height=0)
-
-
-def prev_page():
- st.session_state.offset -= tweets_per_page
-
- st.session_state.update_component += 1
- scroll_into_view()
-
-
-def next_page():
- st.session_state.offset += tweets_per_page
-
- st.session_state.update_component += 1
- scroll_into_view()
-
-
-# Requesting
+# ------ Requests ------ #
-@st.cache_data(ttl=1800, show_spinner=False)
+@st.cache_data(ttl=600, show_spinner=True)
def wayback_tweets(
username,
collapse,
return archived_tweets
-@st.cache_data(ttl=1800, show_spinner=False)
-def tweets_parser(archived_tweets, field_options):
+@st.cache_data(ttl=600, show_spinner=True)
+def tweets_parser(archived_tweets, username, field_options):
parser = TweetsParser(archived_tweets, username, field_options)
parsed_tweets = parser.parse()
return parsed_tweets
-@st.cache_data(ttl=1800, show_spinner=False)
+@st.cache_data(ttl=600, show_spinner=True)
def tweets_exporter(parsed_tweets, username, field_options):
exporter = TweetsExporter(parsed_tweets, username, field_options)
df = exporter.dataframe
+ file_name = exporter.filename
- return df
+ return df, file_name
-@st.cache_data(ttl=1800, show_spinner=False)
-def tweets_json_parser():
- if archived_mimetype[i] == "application/json":
- json_parser = JsonParser(parsed_archived_tweet_url[i])
- text_json = json_parser.parse()
+# ------ User Interface Settings ------ #
- if text_json:
- return text_json
+st.info(
+ """🥳 [**Pre-release 1.0x: New Streamlit app, CLI and Python module**](https://claromes.github.io/waybacktweets)""" # noqa: E501
+)
- return None
+st.image(TITLE, use_column_width="never")
+st.caption(
+ "[](https://github.com/claromes/waybacktweets/releases) [](https://github.com/claromes/waybacktweets)" # noqa: E501
+)
+st.write("Retrieve archived tweets CDX data in CSV, JSON, and HTML formats.")
+st.caption(
+ "This application uses the Wayback Tweets Python package, which can be used as a module or as a standalone command line tool. [Read the documentation](https://claromes.github.io/waybacktweets)." # noqa: E501
+)
-def display_tweet_header():
- header = st.markdown(
- f"[**archived url ↗**]({archived_tweet_url[i]}) · [**tweet url ↗**]({original_tweet_url[i]}) · **mimetype:** {archived_mimetype[i]} · **archived timestamp:** {parsed_archived_timestamp[i]} · **archived status code:** {archived_statuscode[i]}" # noqa: E501
- )
+st.caption(
+ "To access the legacy version of Wayback Tweets [click here](https://waybacktweets-legacy.streamlit.app)." # noqa: E501
+)
+
+st.divider()
- return header
+# -- Filters -- #
+username = st.text_input("Username *", key="username", placeholder="Without @")
-def display_tweet_iframe():
- tweet_iframe = components.iframe(
- archived_tweet_url[i],
- height=500,
- scrolling=True,
+with st.expander("Filtering"):
+ start_date = datetime(2006, 1, 1)
+ end_date = datetime.now()
+
+ st.session_state.archived_timestamp_filter = st.date_input(
+ "Tweets saved between",
+ (start_date, end_date),
+ start_date,
+ end_date,
+ format="YYYY/MM/DD",
+ help="Using the `from` and `to` filters. Format: YYYY/MM/DD",
)
- return tweet_iframe
+ col1, col2 = st.columns(2)
+
+ with col1:
+ limit = st.text_input(
+ "Limit",
+ key="limit",
+ help="Query result limits",
+ )
+ with col2:
+ offset = st.text_input(
+ "Offset",
+ key="offset",
+ help="Allows for a simple way to scroll through the results",
+ )
-# Interface Settings
+ col3, col4 = st.columns(2)
-st.logo(LOGO)
+ with col3:
+ not_available = st.checkbox(
+ "Only tweets not available",
+ key="not_available",
+ help="Checks if the archived URL still exists on Twitter",
+ )
-st.success(
- """**v1.0 🎉: CLI and Python Module**
+ with col4:
+ unique = st.checkbox(
+ "Only unique Wayback Machine URLs",
+ key="unique",
+ help="Filtering by the collapse option using the `urlkey` field and the URL Match Scope `prefix`", # noqa: E501
+ )
-$ `pip install waybacktweets`
-$ `waybacktweets --from 20150101 --to 20191231 --limit 250 jack`
+query = st.button("Query", type="primary", use_container_width=True)
-Retrieve archived tweets CDX data in CSV, JSON, and HTML formats using the command line.
+# ------ Results ------ #
-Read the documentation: [claromes.github.io/waybacktweets](https://claromes.github.io/waybacktweets).""" # noqa: E501
-)
+if username != st.session_state.current_username:
+ st.session_state.current_username = username
-st.image(TITLE, use_column_width="never")
-st.caption(
- "[](https://github.com/claromes/waybacktweets/releases) [](https://github.com/claromes/waybacktweets)" # noqa: E501
-)
-st.caption("Display multiple archived tweets on Wayback Machine.")
-st.caption(
- "Download data via command line with the [`waybacktweets`](https://pypi.org/project/waybacktweets) Python package." # noqa: E501
-)
+if query or st.session_state.count:
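+    # The "unique" checkbox collapses CDX results on the urlkey field and widens the URL match scope to prefix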
+ if unique:
+ collapse = "urlkey"
+ matchtype = "prefix"
-username = st.text_input("Username", placeholder="Without @")
+ try:
+ wayback_tweets = wayback_tweets(
+ st.session_state.current_username,
+ collapse,
+ st.session_state.archived_timestamp_filter[0],
+ st.session_state.archived_timestamp_filter[1],
+ limit,
+ offset,
+ matchtype,
+ )
-start_date = datetime(2006, 1, 1)
-end_date = datetime.now()
+ if not wayback_tweets:
+ st.error("No data was saved due to an empty response.")
+ st.stop()
-st.session_state.archived_timestamp_filter = st.date_input(
- "Tweets saved between",
- (start_date, end_date),
- start_date,
- end_date,
- format="YYYY/MM/DD",
- help="Using the `from` and `to` filters. Format: YYYY/MM/DD",
-)
+ parsed_tweets = tweets_parser(
+ wayback_tweets, st.session_state.current_username, FIELD_OPTIONS
+ )
-not_available = st.checkbox(
- "Only tweets not available",
- help="Checks if the archived URL still exists on Twitter",
-)
+ df, file_name = tweets_exporter(
+ parsed_tweets, st.session_state.current_username, FIELD_OPTIONS
+ )
-unique = st.checkbox(
- "Only unique Wayback Machine URLs",
- help="Filtering by the collapse option using the `urlkey` field and the URL Match Scope `prefix`", # noqa: E501
-)
+ csv_data = df.to_csv(index=False)
+ json_data = df.to_json(orient="records", lines=False)
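+        # HTMLTweetsVisualizer accepts either a JSON file path or a JSON string, so the dataframe export can be passed in directly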
+ html = HTMLTweetsVisualizer(username, json_data)
+ html_content = html.generate()
-query = st.button("Query", type="primary", use_container_width=True)
+ st.session_state.count = len(df)
+ st.write(f"**{st.session_state.count} URLs have been captured**")
-# Tweet Listing Settings
+ # -- HTML -- #
-if username != st.session_state.current_username:
- st.session_state.current_username = username
- st.session_state.offset = 0
+ st.header("HTML", divider="gray")
+ st.write(
+ f"Visualize tweets more efficiently through iframes. Download the @{st.session_state.current_username}'s archived tweets in HTML." # noqa: E501
+ )
-if query or st.session_state.count:
- tweets_per_page = 25
+ col5, col6 = st.columns([1, 18])
- collapse = None
- matchType = None
+ with col5:
+ st.image(DOWNLOAD, width=22)
- if unique:
- collapse = "urlkey"
- matchType = "prefix"
+ with col6:
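+        # Expose the generated HTML as a base64 data: URI so the link downloads it without writing a file on the server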
+ b64_html = base64.b64encode(html_content.encode()).decode()
+ href_html = f"data:text/html;base64,{b64_html}"
- try:
- with st.spinner("Waybacking..."):
- wayback_tweets = wayback_tweets(
- st.session_state.current_username,
- collapse,
- st.session_state.archived_timestamp_filter[0],
- st.session_state.archived_timestamp_filter[1],
- tweets_per_page,
- st.session_state.offset,
- matchType,
+ st.markdown(
+ f'<a href="{href_html}" download="{file_name}.html" title="Download {file_name}.html">{file_name}.html</a>', # noqa: E501
+ unsafe_allow_html=True,
)
- parsed_tweets = tweets_parser(wayback_tweets, FIELD_OPTIONS)
- df = tweets_exporter(
- parsed_tweets, st.session_state.current_username, FIELD_OPTIONS
- )
+ st.image(PREVIEW_IMAGE, "Preview image")
- st.session_state.count = len(df)
-
- # st.caption(
- # "The number of tweets per page is set to 25, and this is a fixed value due to the API rate limit." # noqa: E501
- # )
- # st.write(f"**{st.session_state.count} URLs have been captured**")
-
- if st.session_state.count:
- if tweets_per_page > st.session_state.count:
- tweets_per_page = st.session_state.count
-
- # Tweet Listing Processing
-
- progress = st.empty()
-
- parsed_archived_timestamp = df["parsed_archived_timestamp"]
- archived_tweet_url = df["archived_tweet_url"]
- parsed_archived_tweet_url = df["parsed_archived_tweet_url"]
- original_tweet_url = df["original_tweet_url"]
- parsed_tweet_url = df["parsed_tweet_url"]
- available_tweet_text = df["available_tweet_text"]
- available_tweet_is_RT = df["available_tweet_is_RT"]
- available_tweet_info = df["available_tweet_info"]
- archived_mimetype = df["archived_mimetype"]
- archived_statuscode = df["archived_statuscode"]
-
- st.divider()
- st.session_state.current_username = username
-
- return_none_count = 0
-
- start_index = st.session_state.offset
- end_index = min(st.session_state.count, start_index + tweets_per_page)
-
- for i in range(tweets_per_page):
- try:
- parsed_text_json = tweets_json_parser()
-
- # Display all tweets
- if not not_available:
- # Display available tweets
- if available_tweet_text[i]:
- display_tweet_header()
-
- if available_tweet_is_RT[i]:
- st.info("*Retweet*")
-
- st.write(available_tweet_text[i])
- st.write(f"**{available_tweet_info[i]}**")
-
- st.divider()
- # Display tweets not available with text/html, unk, warc/revisit mimetype or application/json mimetype without parsed JSON text # noqa: E501
- elif (
- archived_mimetype[i] != "application/json"
- and not available_tweet_text[i]
- ):
- display_tweet_header()
- if (
- ".jpg" in parsed_tweet_url[i]
- or ".png" in parsed_tweet_url[i]
- ) and (400 <= archived_statuscode[i] <= 511):
- display_tweet_iframe()
- elif "/status/" not in parsed_tweet_url[i]:
- st.info(
- "This isn't a status or is not available" # noqa: E501
- )
- elif (
- f"{st.session_state.current_username}"
- not in parsed_tweet_url[i]
- ):
- st.info(
- f"Replying to {st.session_state.current_username}" # noqa: E501
- )
- else:
- display_tweet_iframe()
-
- st.divider()
- # Display tweets not available with application/json mimetype and parsed JSON text # noqa: E501
- elif (
- archived_mimetype[i] == "application/json"
- and not available_tweet_text[i]
- ):
- display_tweet_header()
- st.code(parsed_text_json)
-
- st.divider()
-
- # Display only tweets not available
- if not_available:
- # Display tweets not available with text/html, unk, warc/revisit return # noqa: E501
- if (
- archived_mimetype[i] != "application/json"
- and not available_tweet_text[i]
- ):
- return_none_count += 1
-
- display_tweet_header()
- if (
- ".jpg" in parsed_tweet_url[i]
- or ".png" in parsed_tweet_url[i]
- ) and (400 <= archived_statuscode[i] <= 511):
- display_tweet_iframe()
- elif "/status/" not in parsed_tweet_url[i]:
- st.info(
- "This isn't a status or is not available" # noqa: E501
- )
- elif (
- f"{st.session_state.current_username}"
- not in parsed_tweet_url[i]
- ):
- st.info(
- f"Replying to {st.session_state.current_username}" # noqa: E501
- )
- else:
- display_tweet_iframe()
-
- st.divider()
-
- # Display tweets not available with application/json return # noqa: E501
- elif (
- archived_mimetype[i] == "application/json"
- and not available_tweet_text[i]
- ):
- return_none_count += 1
-
- display_tweet_header()
- st.code(parsed_text_json)
-
- st.divider()
-
- progress.write(
- f"{return_none_count} URLs have been captured in the range {start_index}-{end_index}" # noqa: E501
- )
- except IndexError:
- if start_index <= 0:
- st.session_state.prev_disabled = True
- else:
- st.session_state.prev_disabled = False
-
- st.session_state.next_disabled = True
-
- prev, _, next = st.columns([3, 4, 3])
-
- prev.button(
- "Previous",
- disabled=st.session_state.prev_disabled,
- key="prev_button_key",
- on_click=prev_page,
- type="primary",
- use_container_width=True,
- )
- next.button(
- "Next",
- disabled=st.session_state.next_disabled,
- key="next_button_key",
- on_click=next_page,
- type="primary",
- use_container_width=True,
+ # -- CSV -- #
+
+ st.header("CSV", divider="gray")
+ st.write(
+ "Check the data returned in the dataframe below and download the file."
)
- if not wayback_tweets:
- st.error(
- "Failed to establish a new connection with web.archive.org. Max retries exceeded. Please wait a few minutes and try again." # noqa: E501
+ col7, col8 = st.columns([1, 18])
+
+ with col7:
+ st.image(DOWNLOAD, width=22)
+
+ with col8:
+ b64_csv = base64.b64encode(csv_data.encode()).decode()
+ href_csv = f"data:file/csv;base64,{b64_csv}"
+
+ st.markdown(
+ f'<a href="{href_csv}" download="{file_name}.csv" title="Download {file_name}.csv">{file_name}.csv</a>', # noqa: E501
+ unsafe_allow_html=True,
+ )
+
+ st.dataframe(df, use_container_width=True)
+
+ # -- JSON -- #
+
+ st.header("JSON", divider="gray")
+ st.write("Check the data returned in JSON format below and download the file.")
+
+ col9, col10 = st.columns([1, 18])
+
+ with col9:
+ st.image(DOWNLOAD, width=22)
+
+ with col10:
+ b64_json = base64.b64encode(json_data.encode()).decode()
+ href_json = f"data:file/json;base64,{b64_json}"
+
+ st.markdown(
+ f'<a href="{href_json}" download="{file_name}.json" title="Download {file_name}.json">{file_name}.json</a>', # noqa: E501
+ unsafe_allow_html=True,
)
+
+ st.json(json_data, expanded=False)
except TypeError as e:
st.error(
f"""
If the problem persists [open an issue](https://github.com/claromes/waybacktweets/issues).""" # noqa: E501
)
- st.session_state.offset = 0
+ st.stop()
except Exception as e:
- st.error(f"{e}")
+ st.error(str(e))
st.stop()
-requests>=2.30.0
streamlit==1.35.0
waybacktweets>=1.0
--- /dev/null
+<svg version="1.2" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 170 272" width="170" height="272">
+  <title>download-svg</title>
+ <style>
+ .s0 { opacity: .9;fill: none;stroke: #000000;stroke-linecap: round;stroke-linejoin: round;stroke-width: 16 }
+ </style>
+ <path id="Layer" fill-rule="evenodd" class="s0" d="m15 258q35.5-3.7 70.7-3.7c35.3 0 63 3.7 70.3 3.7"/>
+ <path id="Layer" fill-rule="evenodd" class="s0" d="m84.3 164.8q-1.6-37.3-1.6-74.5c0-37.1 1.6-66.3 1.6-74"/>
+ <path id="Layer" fill-rule="evenodd" class="s0" d="m150 134c-4.7 19.7-52.6 64.5-57.6 76.4-5 11.8-62.3-39.4-72.4-62.9"/>
+</svg>
\ No newline at end of file
--- /dev/null
+[theme]
+base = "light"
+primaryColor = "#ab2e33"
+secondaryBackgroundColor = "#efefef"
+textColor = "#000000"
+backgroundColor = "#f9f9f9"
+font = "serif"
+
+[client]
+displayEnabled = true
+toolbarMode = "minimal"
--- /dev/null
+import datetime
+import re
+from urllib.parse import unquote
+
+import requests
+import streamlit as st
+import streamlit.components.v1 as components
+
+year = datetime.datetime.now().year
+
+st.set_page_config(
+ page_title="Wayback Tweets",
+ page_icon="🏛️",
+ layout="centered",
+ menu_items={
+ "About": """
+ ## 🏛️ Wayback Tweets
+
+        Tool that uses the Wayback CDX Server API to display multiple archived tweets from the Wayback Machine, so you don't have to open each link manually. Users can filter by year and view tweets whose original URL is no longer available.
+
+        This tool is a prototype; please feel free to send [feedback](https://github.com/claromes/waybacktweets/issues). Created by [@claromes](https://claromes.com).
+
+ -------
+ """, # noqa: E501
+ },
+)
+
+# https://discuss.streamlit.io/t/remove-hide-running-man-animation-on-top-of-page/21773/3
+hide_streamlit_style = """
+<style>
+ header[data-testid="stHeader"] {
+ opacity: 0.5;
+ }
+ iframe {
+ border: 1px solid #dddddd;
+ border-radius: 0.5rem;
+ }
+ div[data-testid="InputInstructions"] {
+ visibility: hidden;
+ }
+</style>
+"""
+
+st.markdown(hide_streamlit_style, unsafe_allow_html=True)
+
+if "current_handle" not in st.session_state:
+ st.session_state.current_handle = ""
+
+if "prev_disabled" not in st.session_state:
+ st.session_state.prev_disabled = False
+
+if "next_disabled" not in st.session_state:
+ st.session_state.next_disabled = False
+
+if "next_button" not in st.session_state:
+ st.session_state.next_button = False
+
+if "prev_button" not in st.session_state:
+ st.session_state.prev_button = False
+
+if "update_component" not in st.session_state:
+ st.session_state.update_component = 0
+
+if "offset" not in st.session_state:
+ st.session_state.offset = 0
+
+if "saved_at" not in st.session_state:
+ st.session_state.saved_at = (2006, year)
+
+if "count" not in st.session_state:
+ st.session_state.count = False
+
+
+def scroll_into_view():
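+    # Scroll the parent Streamlit page back to the top after paging, via a zero-size HTML component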
+ js = f"""
+ <script>
+ window.parent.document.querySelector('section.main').scrollTo(0, 0);
+ let update_component = {st.session_state.update_component} // Force component update to generate scroll
+ </script>
+ """ # noqa: E501
+
+ components.html(js, width=0, height=0)
+
+
+def clean_tweet(tweet):
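+    # Rebuild a canonical twitter.com status URL when the current handle and a numeric status ID can be recovered from the archived link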
+ handle = st.session_state.current_handle.lower()
+ tweet_lower = tweet.lower()
+
+ pattern = re.compile(r"/status/(\d+)")
+ match_lower_case = pattern.search(tweet_lower)
+ match_original_case = pattern.search(tweet)
+
+ if match_lower_case and handle in tweet_lower:
+ return f"https://twitter.com/{st.session_state.current_handle}/status/{match_original_case.group(1)}" # noqa: E501
+ else:
+ return tweet
+
+
+def clean_link(link):
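+    # Rebuild the Wayback URL around the recovered status ID, reusing the capture timestamp of the current loop iteration (timestamp[i])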
+ handle = st.session_state.current_handle.lower()
+ link = link.lower()
+
+ pattern = re.compile(r"/status/(\d+)")
+ match = pattern.search(link)
+
+ if match and handle in link:
+ return f"https://web.archive.org/web/{timestamp[i]}/https://twitter.com/{st.session_state.current_handle}/status/{match.group(1)}" # noqa: E501
+ else:
+ return link
+
+
+def pattern_tweet(tweet):
+ # Reply: /status//
+ # Link: /status///
+ # Twimg: /status/https://pbs
+
+ pattern = re.compile(r'/status/"([^"]+)"')
+
+ match = pattern.search(tweet)
+ if match:
+ return match.group(1).lstrip("/")
+ else:
+ return tweet
+
+
+def pattern_tweet_id(tweet):
+ # Delete sub-endpoint (/photos, /likes, /retweet...)
+ pattern_username = re.compile(r"https://twitter\.com/([^/]+)/status/\d+")
+ match_username = pattern_username.match(tweet)
+
+ pattern_id = r"https://twitter.com/\w+/status/(\d+)"
+ match_id = re.search(pattern_id, tweet)
+
+ if match_id and match_username:
+ tweet_id = match_id.group(1)
+ username = match_username.group(1)
+ return f"https://twitter.com/{username}/status/{tweet_id}"
+ else:
+ return tweet
+
+
+def check_double_status(url_wb, url_tweet):
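+    # Flag Wayback URLs that contain "/status/" twice while the extracted tweet URL has no twitter.com domain (typical of archived replies)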
+ if url_wb.count("/status/") == 2 and "twitter.com" not in url_tweet:
+ return True
+
+ return False
+
+
+def embed(tweet):
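+    # Query Twitter's public oEmbed endpoint and scrape the returned blockquote HTML for text, author info, and retweet status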
+ try:
+ url = f"https://publish.twitter.com/oembed?url={clean_tweet(tweet)}"
+ response = requests.get(url)
+
+ regex = r'<blockquote class="twitter-tweet"(?: [^>]+)?><p[^>]*>(.*?)<\/p>.*?— (.*?)<\/a>' # noqa: E501
+ regex_author = r"^(.*?)\s*\("
+
+ if response.status_code == 200 or response.status_code == 302:
+ status_code = response.status_code
+ html = response.json()["html"]
+ author_name = response.json()["author_name"]
+
+ matches_html = re.findall(regex, html, re.DOTALL)
+
+ tweet_content = []
+ user_info = []
+ is_RT = []
+
+ for match in matches_html:
+ tweet_content_match = re.sub(r"<a[^>]*>|<\/a>", "", match[0].strip())
+ tweet_content_match = tweet_content_match.replace("<br>", "\n")
+
+ user_info_match = re.sub(r"<a[^>]*>|<\/a>", "", match[1].strip())
+ user_info_match = user_info_match.replace(")", "), ")
+
+ match_author = re.search(regex_author, user_info_match)
+ author_tweet = match_author.group(1)
+
+ if tweet_content_match:
+ tweet_content.append(tweet_content_match)
+ if user_info_match:
+ user_info.append(user_info_match)
+
+ is_RT_match = False
+ if author_name != author_tweet:
+ is_RT_match = True
+
+ is_RT.append(is_RT_match)
+
+ return status_code, tweet_content, user_info, is_RT
+ else:
+ return False
+ except requests.exceptions.Timeout:
+ st.error("Connection to web.archive.org timed out.")
+ except requests.exceptions.ConnectionError:
+ st.error("Failed to establish a new connection with web.archive.org.")
+ except UnboundLocalError:
+ st.empty()
+
+
+@st.cache_data(ttl=1800, show_spinner=False)
+def tweets_count(handle, saved_at):
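+    # Count captures via the CDX API, collapsing on the first 8 timestamp digits (one capture per day) and skipping the header row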
+ url = f"https://web.archive.org/cdx/search/cdx?url=https://twitter.com/{handle}/status/*&collapse=timestamp:8&output=json&from={saved_at[0]}&to={saved_at[1]}" # noqa: E501
+ try:
+ response = requests.get(url)
+
+ if response.status_code == 200:
+ data = response.json()
+ if data and len(data) > 1:
+ total_tweets = len(data) - 1
+ return total_tweets
+ else:
+ return 0
+ except requests.exceptions.Timeout:
+ st.error("Connection to web.archive.org timed out.")
+ st.stop()
+ except requests.exceptions.ConnectionError:
+ st.error("Failed to establish a new connection with web.archive.org.")
+ except UnboundLocalError:
+ st.empty()
+
+
+@st.cache_data(ttl=1800, show_spinner=False)
+def query_api(handle, limit, offset, saved_at):
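+    # Page through the CDX API with limit/offset, using the same one-capture-per-day collapse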
+ if not handle:
+ st.warning("username, please!")
+ st.stop()
+
+ url = f"https://web.archive.org/cdx/search/cdx?url=https://twitter.com/{handle}/status/*&collapse=timestamp:8&output=json&limit={limit}&offset={offset}&from={saved_at[0]}&to={saved_at[1]}" # noqa: E501
+ try:
+ response = requests.get(url)
+ response.raise_for_status()
+
+ if response.status_code == 200 or response.status_code == 304:
+ return response.json()
+ except requests.exceptions.Timeout:
+ st.error("Connection to web.archive.org timed out.")
+ except requests.exceptions.ConnectionError:
+ st.error("Failed to establish a new connection with web.archive.org.")
+ except UnboundLocalError:
+ st.empty()
+ except requests.exceptions.HTTPError:
+ st.error(
+ """
+ **Temporarily Offline**
+
+ Internet Archive services are temporarily offline. Please check Internet Archive [Twitter feed](https://twitter.com/internetarchive/) for the latest information.
+ """ # noqa: E501
+ )
+ st.stop()
+
+
+@st.cache_data(ttl=1800, show_spinner=False)
+def parse_links(links):
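+    # Turn CDX rows into Wayback URLs, cleaned tweet links, MIME types, and timestamps (the first row is the CDX header)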
+ parsed_links = []
+ timestamp = []
+ tweet_links = []
+ parsed_mimetype = []
+
+ for link in links[1:]:
+ tweet_remove_char = unquote(link[2]).replace("’", "")
+ cleaned_tweet = pattern_tweet(tweet_remove_char).strip('"')
+
+ url = f"https://web.archive.org/web/{link[1]}/{tweet_remove_char}"
+
+ parsed_links.append(url)
+ timestamp.append(link[1])
+ tweet_links.append(cleaned_tweet)
+ parsed_mimetype.append(link[3])
+
+ return parsed_links, tweet_links, parsed_mimetype, timestamp
+
+
+def attr(i):
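+    # Render the per-tweet header line (archived URL, original URL, MIME type, capture time); relies on the module-level parse results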
+ original_tweet = pattern_tweet_id(clean_tweet(tweet_links[i]))
+
+ if status:
+ original_tweet = pattern_tweet_id(f"https://twitter.com/{tweet_links[i]}")
+ elif "://" not in tweet_links[i]:
+ original_tweet = pattern_tweet_id(f"https://{tweet_links[i]}")
+
+ st.markdown(
+ f'{i+1 + st.session_state.offset}. [**archived url**]({link}) · [**original url**]({original_tweet}) · **MIME Type:** {mimetype[i]} · **Saved at:** {datetime.datetime.strptime(timestamp[i], "%Y%m%d%H%M%S")}' # noqa: E501
+ )
+
+
+def display_tweet():
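+    # Show the oEmbed-recovered content for tweets that are still available on Twitter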
+ if (
+ mimetype[i] == "application/json"
+ or mimetype[i] == "text/html"
+ or mimetype[i] == "unk"
+ or mimetype[i] == "warc/revisit"
+ ):
+ if is_RT[0] is True:
+ st.info("*Retweet*")
+ st.write(tweet_content[0])
+ st.write(f"**{user_info[0]}**")
+
+ st.divider()
+ else:
+ st.warning("MIME Type was not parsed.")
+
+ st.divider()
+
+
+def display_not_tweet():
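+    # Fall back to an archived iframe, or to the archived JSON payload, when the tweet can no longer be embedded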
+ original_link = pattern_tweet_id(clean_tweet(tweet_links[i]))
+
+ if status:
+ original_link = pattern_tweet_id(f"https://twitter.com/{tweet_links[i]}")
+ elif "://" not in tweet_links[i]:
+ original_link = pattern_tweet_id(f"https://{tweet_links[i]}")
+
+ response_html = requests.get(original_link)
+
+ if (
+ mimetype[i] == "text/html"
+ or mimetype[i] == "warc/revisit"
+ or mimetype[i] == "unk"
+ ):
+ if (
+ ".jpg" in tweet_links[i] or ".png" in tweet_links[i]
+ ) and response_html.status_code == 200:
+ components.iframe(tweet_links[i], height=500, scrolling=True)
+ elif "/status/" not in original_link:
+ st.info("This isn't a status or is not available")
+ elif status or f"{st.session_state.current_handle}" not in original_link:
+ st.info(f"Replying to {st.session_state.current_handle}")
+ else:
+ components.iframe(clean_link(link), height=500, scrolling=True)
+
+ st.divider()
+ elif mimetype[i] == "application/json":
+ try:
+ response_json = requests.get(link)
+
+ if response_json.status_code == 200:
+ json_data = response_json.json()
+
+ if "data" in json_data:
+ if "text" in json_data["data"]:
+ json_text = json_data["data"]["text"]
+ else:
+ json_text = json_data["data"]
+ else:
+ if "text" in json_data:
+ json_text = json_data["text"]
+ else:
+ json_text = json_data
+
+ st.code(json_text)
+ st.json(json_data, expanded=False)
+
+ st.divider()
+ else:
+ st.error(response_json.status_code)
+
+ st.divider()
+ except requests.exceptions.Timeout:
+ st.error("Connection to web.archive.org timed out.")
+ st.divider()
+ except requests.exceptions.ConnectionError:
+ st.error("Failed to establish a new connection with web.archive.org.")
+ st.divider()
+ except UnboundLocalError:
+ st.empty()
+ else:
+ st.warning("MIME Type was not parsed.")
+ st.divider()
+
+
+def prev_page():
+ st.session_state.offset -= tweets_per_page
+
+ # scroll to top config
+ st.session_state.update_component += 1
+ scroll_into_view()
+
+
+def next_page():
+ st.session_state.offset += tweets_per_page
+
+ # scroll to top config
+ st.session_state.update_component += 1
+ scroll_into_view()
+
+
+# UI
+st.title(
+ "Wayback Tweets [](https://github.com/claromes/waybacktweets)", # noqa: E501
+ anchor=False,
+ help="v0.4.3",
+)
+st.write(
+ "Display multiple archived tweets on Wayback Machine and avoid opening each link manually" # noqa: E501
+)
+
+handle = st.text_input("Username", placeholder="jack")
+
+st.session_state.saved_at = st.slider("Tweets saved between", 2006, year, (2006, year))
+
+not_available = st.checkbox(
+ "Original URLs not available",
+ help="Due to changes in X, it is possible to find available tweets if you are logged into X", # noqa: E501
+)
+
+query = st.button("Query", type="primary", use_container_width=True)
+
+if handle != st.session_state.current_handle:
+ st.session_state.current_handle = handle
+ st.session_state.offset = 0
+
+if query or st.session_state.count:
+ tweets_per_page = 25
+
+ st.session_state.count = tweets_count(handle, st.session_state.saved_at)
+
+ st.caption(
+ "The search optimization uses an 8-digit [collapsing strategy](https://github.com/internetarchive/wayback/blob/master/wayback-cdx-server/README.md?ref=hackernoon.com#collapsing), refining the captures to one per day. The number of tweets per page is set to 25, and this is a fixed value due to the API rate limit." # noqa: E501
+ )
+ st.write(f"**{st.session_state.count} URLs have been captured**")
+
+ if st.session_state.count:
+ if tweets_per_page > st.session_state.count:
+ tweets_per_page = st.session_state.count
+
+ try:
+ progress = st.empty()
+ links = query_api(
+ handle, tweets_per_page, st.session_state.offset, st.session_state.saved_at
+ )
+
+ parse = parse_links(links)
+ parsed_links = parse[0]
+ tweet_links = parse[1]
+ mimetype = parse[2]
+ timestamp = parse[3]
+
+ if links:
+ st.divider()
+
+ st.session_state.current_handle = handle
+
+ return_none_count = 0
+
+ start_index = st.session_state.offset
+ end_index = min(st.session_state.count, start_index + tweets_per_page)
+
+ with st.spinner("Fetching..."):
+ for i in range(tweets_per_page):
+ try:
+ if tweet_links[i]:
+ link = parsed_links[i]
+ tweet = embed(tweet_links[i])
+
+ status = check_double_status(link, tweet_links[i])
+
+ if not not_available:
+ attr(i)
+
+ if tweet:
+ status_code = tweet[0]
+ tweet_content = tweet[1]
+ user_info = tweet[2]
+ is_RT = tweet[3]
+
+ display_tweet()
+ elif not tweet:
+ display_not_tweet()
+
+ if not_available:
+ if not tweet:
+ return_none_count += 1
+ attr(i)
+
+ display_not_tweet()
+
+ progress.write(
+ f"{return_none_count} URLs have been captured in the range {start_index}-{end_index}" # noqa: E501
+ )
+
+ if start_index <= 0:
+ st.session_state.prev_disabled = True
+ else:
+ st.session_state.prev_disabled = False
+
+ if i + 1 == st.session_state.count:
+ st.session_state.next_disabled = True
+ else:
+ st.session_state.next_disabled = False
+ except IndexError:
+ if start_index <= 0:
+ st.session_state.prev_disabled = True
+ else:
+ st.session_state.prev_disabled = False
+
+ st.session_state.next_disabled = True
+
+ prev, _, next = st.columns([3, 4, 3])
+
+ prev.button(
+ "Previous",
+ disabled=st.session_state.prev_disabled,
+ key="prev_button_key",
+ on_click=prev_page,
+ type="primary",
+ use_container_width=True,
+ )
+ next.button(
+ "Next",
+ disabled=st.session_state.next_disabled,
+ key="next_button_key",
+ on_click=next_page,
+ type="primary",
+ use_container_width=True,
+ )
+
+ if not links:
+ st.error("Unable to query the Wayback Machine API.")
+ except TypeError as e:
+ st.error(
+ f"""
+ {e}. Refresh this page and try again.
+ """ # noqa: E501
+ )
+ st.session_state.offset = 0
--- /dev/null
+requests==2.30.0
+streamlit==1.27.0
[[package]]
name = "filelock"
-version = "3.15.3"
+version = "3.15.4"
description = "A platform independent file lock."
optional = false
python-versions = ">=3.8"
files = [
- {file = "filelock-3.15.3-py3-none-any.whl", hash = "sha256:0151273e5b5d6cf753a61ec83b3a9b7d8821c39ae9af9d7ecf2f9e2f17404103"},
- {file = "filelock-3.15.3.tar.gz", hash = "sha256:e1199bf5194a2277273dacd50269f0d87d0682088a3c561c15674ea9005d8635"},
+ {file = "filelock-3.15.4-py3-none-any.whl", hash = "sha256:6ca1fffae96225dab4c6eaf1c4f4f28cd2568d3ec2a44e15a08520504de468e7"},
+ {file = "filelock-3.15.4.tar.gz", hash = "sha256:2207938cbc1844345cb01a5a95524dae30f0ce089eba5b00378295a17e3e90cb"},
]
[package.extras]
[[package]]
name = "sphinx-autodoc-typehints"
-version = "2.2.1"
+version = "2.2.2"
description = "Type hints (PEP 484) support for the Sphinx autodoc extension"
optional = false
python-versions = ">=3.9"
files = [
- {file = "sphinx_autodoc_typehints-2.2.1-py3-none-any.whl", hash = "sha256:ac37852861c58a5ca95be13d5a0f49f3661b5341eaf7de8531842135600aeb90"},
- {file = "sphinx_autodoc_typehints-2.2.1.tar.gz", hash = "sha256:26a81e6444c9b82a952519a3b7c52e45f14a0f81c91cfc7063cfcf2ca109d161"},
+ {file = "sphinx_autodoc_typehints-2.2.2-py3-none-any.whl", hash = "sha256:b98337a8530c95b73ba0c65465847a8ab0a13403bdc81294d5ef396bbd1f783e"},
+ {file = "sphinx_autodoc_typehints-2.2.2.tar.gz", hash = "sha256:128e600eeef63b722f3d8dac6403594592c8cade3ba66fd11dcb997465ee259d"},
]
[package.dependencies]
html_file_path = f"{self.filename}.html"
- html = HTMLTweetsVisualizer(json_file_path, html_file_path, self.username)
+ html = HTMLTweetsVisualizer(self.username, json_file_path, html_file_path)
html_content = html.generate()
html.save(html_content)
""" # noqa: E501
url = "https://web.archive.org/cdx/search/cdx"
- status_pathname = "status/*"
+ wildcard_pathname = "/*"
if self.matchtype:
- status_pathname = ""
+ wildcard_pathname = ""
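+            # an explicit matchtype already scopes the CDX query, so the trailing wildcard is dropped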
params = {
- "url": f"https://twitter.com/{self.username}/{status_pathname}",
+ "url": f"https://twitter.com/{self.username}/status{wildcard_pathname}",
"output": "json",
}
"""
import json
-from typing import Any, Dict, List
+import os
+from typing import Any, Dict, List, Union
from waybacktweets.utils import timestamp_parser
Class responsible for generating an HTML file to visualize the parsed data.
Args:
- json_file_path (str): The path of the JSON file.
- html_file_path (str): The path where the HTML file will be saved.
username (str): The username associated with the tweets.
+ json_file_path (Union[str, List[str]]): The path of the JSON file or the JSON data itself.
+ html_file_path (str, optional): The path where the HTML file will be saved.
"""
- def __init__(self, json_file_path: str, html_file_path: str, username: str):
+ def __init__(
+ self,
+ username: str,
+ json_file_path: Union[str, List[str]],
+ html_file_path: str = None,
+ ):
+ self.username = username
self.json_file_path = self._json_loader(json_file_path)
self.html_file_path = html_file_path
- self.username = username
@staticmethod
- def _json_loader(json_file_path: str) -> List[Dict[str, Any]]:
+ def _json_loader(json_file_path: Union[str, List[str]]) -> List[Dict[str, Any]]:
"""
- Reads and loads JSON data from a specified file path.
+ Reads and loads JSON data from a specified file path or JSON string.
Args:
- json_file_path (str): The path of the JSON file.
+ json_file_path (Union[str, List[str]]): The path of the JSON file or the JSON data itself.
Returns:
- The content of the JSON file.
+ The content of the JSON file or data.
"""
- with open(json_file_path, "r", encoding="utf-8") as f:
- return json.load(f)
+ if os.path.isfile(json_file_path):
+ with open(json_file_path, "r", encoding="utf-8") as f:
+ return json.load(f)
+
+ return json.loads(json_file_path)
def generate(self) -> str:
"""
html = f"<html>\n<!-- This content was generated by Wayback Tweets. Visit: https://claromes.github.io/waybacktweets -->\n"
html += f"\n<head>\n<title>@{self.username}'s archived tweets</title>\n"
html += "<style>\n"
- html += "body { font-family: monospace; background-color: #f5f8fa; color: #1c1e21; margin: 0; padding: 20px; }\n"
+ html += "body { font-family: monospace; background-color: whitesmoke; color: #1c1e21; margin: 0; padding: 20px; }\n"
html += ".container { display: flex; flex-wrap: wrap; gap: 20px; }\n"
- html += ".tweet { flex: 0 1 calc(33.33% - 20px); background-color: #ffffff; border: 1px solid #e1e8ed; border-radius: 10px; padding: 15px; overflow-wrap: break-word; margin: auto; width: 600px; }\n"
+ html += ".tweet { flex: 0 1 calc(33.33% - 20px); background-color: #ffffff; border: 1px solid #e2e2e2; border-radius: 10px; padding: 15px; overflow-wrap: break-word; margin: auto; width: 600px; }\n"
html += ".tweet strong { font-weight: bold; }\n"
html += ".tweet a { color: #000000; text-decoration: none; }\n"
html += ".content { color: #000000; }\n"
for index, tweet in enumerate(self.json_file_path):
html += '<div class="tweet">\n'
- if (
- tweet["archived_mimetype"] != "application/json"
- and not tweet["available_tweet_text"]
- ):
+ if not tweet["available_tweet_text"]:
iframe_src = {
"Archived Tweet": tweet["archived_tweet_url"],
"Parsed Archived Tweet": tweet["parsed_archived_tweet_url"],
"Parsed Tweet": tweet["parsed_tweet_url"],
}
+ html += f'<p class="source">{tweet["original_tweet_url"]}</p>\n'
+ html += f'<p class="source">{tweet["archived_mimetype"]}</p>\n'
+ html += "<br>\n"
+
for key, value in iframe_src.items():
key_cleaned = key.replace(" ", "_")
index=index, url=value, key_cleaned=key_cleaned
)
- html += "<br>\n"
- html += f'<p class="source">{tweet["original_tweet_url"]}</p>\n'
-
if tweet["available_tweet_text"]:
html += "<br>\n"
html += f'<p><strong class="content">Available Tweet Content:</strong> {tweet["available_tweet_text"]}</p>\n'
html += "<br>\n"
html += f'<p><strong>Archived URL Key:</strong> {tweet["archived_urlkey"]}</p>\n'
html += f'<p><strong>Archived Timestamp:</strong> {timestamp_parser(tweet["archived_timestamp"])} ({tweet["archived_timestamp"]})</p>\n'
- html += f'<p><strong>Archived mimetype: {tweet["archived_mimetype"]}</strong></p>\n'
+ html += f'<p><strong>Archived mimetype:</strong> {tweet["archived_mimetype"]}</p>\n'
html += f'<p><strong>Archived Statuscode:</strong> {tweet["archived_statuscode"]}</p>\n'
html += (
f'<p><strong>Archived Digest:</strong> {tweet["archived_digest"]}</p>\n'