import base64
-from datetime import datetime
+from datetime import datetime, timedelta
import streamlit as st
PAGE_ICON = "assets/parthenon.png"
TITLE = "assets/waybacktweets.png"
-PREVIEW_IMAGE = "assets/preview_image.jpg"
DOWNLOAD = "assets/download.svg"
collapse = None
matchtype = None
-start_date = datetime(2006, 1, 1)
+start_date = datetime.now() - timedelta(days=365 * 2)
end_date = datetime.now()
+min_date = datetime(2006, 1, 1)
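+# Defaults: search the last two years; min_date (Twitter's launch, 2006) is the earliest selectable date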
# ------ Verbose Mode Configuration ------ #
# ------ Requests ------ #
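+# Cache results for 10 minutes; the default cache spinner is disabled in favor of custom st.spinner messages below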
-@st.cache_data(ttl=600, show_spinner=True)
+@st.cache_data(ttl=600, show_spinner=False)
def wayback_tweets(
    username, collapse, timestamp_from, timestamp_to, limit, offset, matchtype
):
    # Fetch archived tweet snapshots via the waybacktweets CDX client
    response = WaybackTweets(
        username, collapse, timestamp_from, timestamp_to, limit, offset, matchtype
    )
    archived_tweets = response.get()
    return archived_tweets
-@st.cache_data(ttl=600, show_spinner=True)
+@st.cache_data(ttl=600, show_spinner=False)
def tweets_parser(archived_tweets, username, field_options):
    parser = TweetsParser(archived_tweets, username, field_options)
    parsed_tweets = parser.parse()
    return parsed_tweets
-@st.cache_data(ttl=600, show_spinner=True)
+@st.cache_data(ttl=600, show_spinner=False)
def tweets_exporter(parsed_tweets, username, field_options):
    exporter = TweetsExporter(parsed_tweets, username, field_options)
    # dataframe and filename are assumed attribute names on TweetsExporter
    df = exporter.dataframe
    file_name = exporter.filename
    return df, file_name
st.write("Retrieve archived tweets CDX data in CSV, JSON, and HTML formats.")
-st.caption(
+st.write(
"This application uses the Wayback Tweets Python package, which can be used as a module or as a standalone command line tool. [Read the documentation](https://claromes.github.io/waybacktweets)." # noqa: E501
)
-st.caption(
+st.write(
"To access the legacy version of Wayback Tweets [click here](https://waybacktweets-legacy.streamlit.app)." # noqa: E501
)
username = st.text_input("Username *", key="username", placeholder="Without @")
with st.expander("Filtering"):
- start_date = datetime(2006, 1, 1)
- end_date = datetime.now()
+ st.caption(
+ ":orange[A large date range takes a long time to process, and the app's resources may not be sufficient. Try to perform searches with smaller ranges to get faster results.]" # noqa: E501
+ )
    st.session_state.archived_timestamp_filter = st.date_input(
        "Tweets saved between",
        (start_date, end_date),
- start_date,
+ min_date,
        end_date,
        format="YYYY/MM/DD",
        help="Using the `from` and `to` filters. Format: YYYY/MM/DD",
        help="Allows for a simple way to scroll through the results",
    )
- col3, col4 = st.columns(2)
-
- with col3:
- not_available = st.checkbox(
- "Only tweets not available",
- key="not_available",
- help="Checks if the archived URL still exists on Twitter",
- )
-
- with col4:
- unique = st.checkbox(
- "Only unique Wayback Machine URLs",
- key="unique",
- help="Filtering by the collapse option using the `urlkey` field and the URL Match Scope `prefix`", # noqa: E501
- )
+ unique = st.checkbox(
+ "Only unique Wayback Machine URLs",
+ key="unique",
+ help="Filtering by the collapse option using the `urlkey` field and the URL Match Scope `prefix`", # noqa: E501
+ )
query = st.button("Query", type="primary", use_container_width=True)
matchtype = "prefix"
try:
- wayback_tweets = wayback_tweets(
- st.session_state.current_username,
- collapse,
- st.session_state.archived_timestamp_filter[0],
- st.session_state.archived_timestamp_filter[1],
- limit,
- offset,
- matchtype,
- )
+ with st.spinner(
+ f"Waybacking @{st.session_state.current_username}'s archived tweets"
+ ):
+ wayback_tweets = wayback_tweets(
+ st.session_state.current_username,
+ collapse,
+ st.session_state.archived_timestamp_filter[0],
+ st.session_state.archived_timestamp_filter[1],
+ limit,
+ offset,
+ matchtype,
+ )
    if not wayback_tweets:
        st.error("No data was saved due to an empty response.")
        st.stop()
- parsed_tweets = tweets_parser(
- wayback_tweets, st.session_state.current_username, FIELD_OPTIONS
- )
+ with st.spinner(
+ f"Parsing @{st.session_state.current_username}'s archived tweets"
+ ):
+ parsed_tweets = tweets_parser(
+ wayback_tweets, st.session_state.current_username, FIELD_OPTIONS
+ )
- df, file_name = tweets_exporter(
- parsed_tweets, st.session_state.current_username, FIELD_OPTIONS
- )
+ df, file_name = tweets_exporter(
+ parsed_tweets, st.session_state.current_username, FIELD_OPTIONS
+ )
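    # Export payloads: CSV and JSON for download; the JSON string also feeds the HTML visualizer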
    csv_data = df.to_csv(index=False)
    json_data = df.to_json(orient="records", lines=False)
    html = HTMLTweetsVisualizer(username, json_data)
    html_content = html.generate()
- st.session_state.count = len(df)
- st.write(f"**{st.session_state.count} URLs have been captured**")
+ # -- Rendering -- #
- # -- HTML -- #
+ if csv_data and json_data and html_content:
+ st.session_state.count = len(df)
+ st.write(f"**{st.session_state.count} URLs have been captured**")
- st.header("HTML", divider="gray")
- st.write(
- f"Visualize tweets more efficiently through iframes. Download the @{st.session_state.current_username}'s archived tweets in HTML." # noqa: E501
- )
+ # -- HTML -- #
- col5, col6 = st.columns([1, 18])
+ st.header("HTML", divider="gray")
+ st.write(
+            f"Visualize tweets more efficiently through iframes. Download @{st.session_state.current_username}'s archived tweets in HTML." # noqa: E501
+ )
- with col5:
- st.image(DOWNLOAD, width=22)
+ col5, col6 = st.columns([1, 18])
- with col6:
- b64_html = base64.b64encode(html_content.encode()).decode()
- href_html = f"data:text/html;base64,{b64_html}"
+ with col5:
+ st.image(DOWNLOAD, width=22)
- st.markdown(
- f'<a href="{href_html}" download="{file_name}.html" title="Download {file_name}.html">{file_name}.html</a>', # noqa: E501
- unsafe_allow_html=True,
- )
+ with col6:
+ b64_html = base64.b64encode(html_content.encode()).decode()
+ href_html = f"data:text/html;base64,{b64_html}"
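+            # A base64 data: URI lets the link below trigger a download without a server endpoint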
- st.image(PREVIEW_IMAGE, "Preview image")
+ st.markdown(
+ f'<a href="{href_html}" download="{file_name}.html" title="Download {file_name}.html">{file_name}.html</a>', # noqa: E501
+ unsafe_allow_html=True,
+ )
- # -- CSV -- #
+ # -- CSV -- #
- st.header("CSV", divider="gray")
- st.write(
- "Check the data returned in the dataframe below and download the file."
- )
+ st.header("CSV", divider="gray")
+ st.write(
+ "Check the data returned in the dataframe below and download the file."
+ )
- col7, col8 = st.columns([1, 18])
+ col7, col8 = st.columns([1, 18])
- with col7:
- st.image(DOWNLOAD, width=22)
+ with col7:
+ st.image(DOWNLOAD, width=22)
- with col8:
- b64_csv = base64.b64encode(csv_data.encode()).decode()
- href_csv = f"data:file/csv;base64,{b64_csv}"
+ with col8:
+ b64_csv = base64.b64encode(csv_data.encode()).decode()
+ href_csv = f"data:file/csv;base64,{b64_csv}"
- st.markdown(
- f'<a href="{href_csv}" download="{file_name}.csv" title="Download {file_name}.csv">{file_name}.csv</a>', # noqa: E501
- unsafe_allow_html=True,
- )
+ st.markdown(
+ f'<a href="{href_csv}" download="{file_name}.csv" title="Download {file_name}.csv">{file_name}.csv</a>', # noqa: E501
+ unsafe_allow_html=True,
+ )
- st.dataframe(df, use_container_width=True)
+ st.dataframe(df, use_container_width=True)
- # -- JSON -- #
+ # -- JSON -- #
- st.header("JSON", divider="gray")
- st.write("Check the data returned in JSON format below and download the file.")
+ st.header("JSON", divider="gray")
+ st.write(
+ "Check the data returned in JSON format below and download the file."
+ )
- col9, col10 = st.columns([1, 18])
+ col9, col10 = st.columns([1, 18])
- with col9:
- st.image(DOWNLOAD, width=22)
+ with col9:
+ st.image(DOWNLOAD, width=22)
- with col10:
- b64_json = base64.b64encode(json_data.encode()).decode()
- href_json = f"data:file/json;base64,{b64_json}"
+ with col10:
+ b64_json = base64.b64encode(json_data.encode()).decode()
+ href_json = f"data:file/json;base64,{b64_json}"
- st.markdown(
- f'<a href="{href_json}" download="{file_name}.json" title="Download {file_name}.json">{file_name}.json</a>', # noqa: E501
- unsafe_allow_html=True,
- )
+ st.markdown(
+ f'<a href="{href_json}" download="{file_name}.json" title="Download {file_name}.json">{file_name}.json</a>', # noqa: E501
+ unsafe_allow_html=True,
+ )
- st.json(json_data, expanded=False)
+ st.json(json_data, expanded=False)
except TypeError as e:
    st.error(
        f"""
    Args:
        username (str): The username associated with the tweets.
- json_file_path (Union[str, List[str]]): The path of the JSON file or the JSON data itself.
+ json_path (Union[str, List[str]]): The path of the JSON file or the JSON data itself.
        html_file_path (str, optional): The path where the HTML file will be saved.
    """
    def __init__(
        self,
        username: str,
- json_file_path: Union[str, List[str]],
+ json_path: Union[str, List[str]],
        html_file_path: str = None,
    ):
        self.username = username
- self.json_file_path = self._json_loader(json_file_path)
+ self.json_path = self._json_loader(json_path)
        self.html_file_path = html_file_path
    @staticmethod
- def _json_loader(json_file_path: Union[str, List[str]]) -> List[Dict[str, Any]]:
+ def _json_loader(json_path: Union[str, List[str]]) -> List[Dict[str, Any]]:
"""
Reads and loads JSON data from a specified file path or JSON string.
Args:
- json_file_path (Union[str, List[str]]): The path of the JSON file or the JSON data itself.
+ json_path (Union[str, List[str]]): The path of the JSON file or the JSON data itself.
        Returns:
            The content of the JSON file or data.
        """
- if os.path.isfile(json_file_path):
- with open(json_file_path, "r", encoding="utf-8") as f:
+ if os.path.isfile(json_path):
+ with open(json_path, "r", encoding="utf-8") as f:
                return json.load(f)
- return json.loads(json_file_path)
+ return json.loads(json_path)
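    # Usage sketch (hypothetical values): the loader accepts either form, e.g.
    # HTMLTweetsVisualizer("user", "tweets.json") or
    # HTMLTweetsVisualizer("user", '[{"archived_tweet_url": "..."}]')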
    def generate(self) -> str:
        """
        html += f"<h1>@{self.username}'s archived tweets</h1>\n"
        html += '<div class="container">\n'
- for index, tweet in enumerate(self.json_file_path):
+ for index, tweet in enumerate(self.json_path):
html += '<div class="tweet">\n'
if not tweet["available_tweet_text"]:
"Parsed Tweet": tweet["parsed_tweet_url"],
}
- html += f'<p class="source">{tweet["original_tweet_url"]}</p>\n'
- html += f'<p class="source">{tweet["archived_mimetype"]}</p>\n'
- html += "<br>\n"
-
            for key, value in iframe_src.items():
                key_cleaned = key.replace(" ", "_")
                html += f'<p><strong class="content">Available Tweet Username:</strong> {tweet["available_tweet_info"]}</p>\n'
                html += "<br>\n"
+ html += f'<p><strong>Archived Tweet:</strong> {tweet["archived_tweet_url"]}</p>\n'
+ html += f'<p><strong>Parsed Archived Tweet:</strong> {tweet["parsed_archived_tweet_url"]}</p>\n'
+ html += f'<p><strong>Original Tweet:</strong> {tweet["original_tweet_url"]}</p>\n'
+ html += (
+ f'<p><strong>Parsed Tweet:</strong> {tweet["parsed_tweet_url"]}</p>\n'
+ )
            html += f'<p><strong>Archived URL Key:</strong> {tweet["archived_urlkey"]}</p>\n'
            html += f'<p><strong>Archived Timestamp:</strong> {timestamp_parser(tweet["archived_timestamp"])} ({tweet["archived_timestamp"]})</p>\n'
            html += f'<p><strong>Archived mimetype:</strong> {tweet["archived_mimetype"]}</p>\n'