--- /dev/null
+import datetime
+
+import requests
+import streamlit as st
+import streamlit.components.v1 as components
+
+from waybacktweets.export_tweets import TweetsExporter
+from waybacktweets.parse_tweets import TweetsParser
+from waybacktweets.request_tweets import WaybackTweets
+from waybacktweets.utils import check_double_status, get_response
+
+# Initial Settings
+
+LOGO = "app/assets/parthenon.svg"
+
+st.set_page_config(
+ page_title="Wayback Tweets",
+ page_icon=LOGO,
+ layout="centered",
+ menu_items={
+ "Report a bug": "https://github.com/claromes/waybacktweets/issues",
+ },
+)
+
+# https://discuss.streamlit.io/t/remove-hide-running-man-animation-on-top-of-page/21773/3
+st.html(
+ """
+<style>
+ header[data-testid="stHeader"] {
+ opacity: 0.5;
+ }
+ iframe {
+ border: 1px solid #dddddd;
+ border-radius: 0.5rem;
+ }
+ div[data-testid="InputInstructions"] {
+ visibility: hidden;
+ }
+ img[data-testid="stLogo"] {
+ scale: 3;
+ padding-left: 10px;
+ }
+</style>
+"""
+)
+
+if "current_username" not in st.session_state:
+ st.session_state.current_username = ""
+
+if "prev_disabled" not in st.session_state:
+ st.session_state.prev_disabled = False
+
+if "next_disabled" not in st.session_state:
+ st.session_state.next_disabled = False
+
+if "next_button" not in st.session_state:
+ st.session_state.next_button = False
+
+if "prev_button" not in st.session_state:
+ st.session_state.prev_button = False
+
+if "update_component" not in st.session_state:
+ st.session_state.update_component = 0
+
+if "offset" not in st.session_state:
+ st.session_state.offset = 0
+
+if "count" not in st.session_state:
+ st.session_state.count = False
+
+start_date = datetime.datetime(2006, 3, 1)
+end_date = datetime.datetime.now()
+
+if "archived_timestamp_filter" not in st.session_state:
+ st.session_state.archived_timestamp_filter = (start_date, end_date)
+
+
+# Pagination Settings
+
+
+def scroll_into_view():
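+    # The update_component counter is interpolated into the script below so the
+    # HTML changes on every call, forcing Streamlit to re-run the component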
+ script = f"""
+ <script>
+ window.parent.document.querySelector('section.main').scrollTo(0, 0);
+ let update_component = {st.session_state.update_component}
+ </script>
+ """
+
+ components.html(script, width=0, height=0)
+
+
+def prev_page():
+ st.session_state.offset -= tweets_per_page
+
+ st.session_state.update_component += 1
+ scroll_into_view()
+
+
+def next_page():
+ st.session_state.offset += tweets_per_page
+
+ st.session_state.update_component += 1
+ scroll_into_view()
+
+
+# Requesting
+
+
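+# Cache the capture count for 30 minutes (ttl=1800) to avoid repeated CDX requests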
+@st.cache_data(ttl=1800, show_spinner=False)
+def tweets_count(username, archived_timestamp_filter):
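+    # Query the CDX API for all /status/ captures, collapsed to one per day (timestamp:8)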
+ url = f"https://web.archive.org/cdx/search/cdx?url=https://twitter.com/{username}/status/*&collapse=timestamp:8&output=json&from={archived_timestamp_filter[0]}&to={archived_timestamp_filter[1]}" # noqa: E501
+
+ try:
+ response = get_response(url=url)
+
+        if response and response.status_code == 200:
+ data = response.json()
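+            # The first row of the CDX JSON output is the header, so exclude it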
+ if data and len(data) > 1:
+ total_tweets = len(data) - 1
+ return total_tweets
+ else:
+ return 0
+ except requests.exceptions.Timeout:
+ st.error("Connection to web.archive.org timed out.")
+ st.stop()
+ except requests.exceptions.ConnectionError:
+ st.error("Failed to establish a new connection with web.archive.org.")
+ st.stop()
+ except UnboundLocalError:
+ st.empty()
+
+
+# Interface Settings
+
+
+st.logo(LOGO)
+
+st.success(
+ """**New Feature: CLI**
+
+You can now retrieve archived tweets using the Wayback Tweets command line tool.
+Download the archived tweets' CDX data in CSV, JSON, and HTML formats.
+
+For more details, [read the documentation](https://github.com/claromes/waybacktweets).""" # noqa: E501
+)
+
+st.title(
+ "Wayback Tweets",
+ anchor=False,
+)
+st.caption(
+ "[](https://github.com/claromes/waybacktweets/releases) [](https://github.com/claromes/waybacktweets)" # noqa: E501
+)
+st.caption("Display multiple archived tweets on Wayback Machine")
+
+username = st.text_input("Username", placeholder="Without @")
+
+start_date = datetime.datetime(2006, 3, 1)
+end_date = datetime.datetime.now()
+
+st.session_state.archived_timestamp_filter = st.date_input(
+ "Tweets saved between",
+ (start_date, end_date),
+ start_date,
+ end_date,
+ format="YYYY/MM/DD",
+ help="YYYY/MM/DD",
+)
+
+not_available = st.checkbox("Only tweets not available")
+
+unique = st.checkbox(
+ "Only unique URLs",
+ help="Filtering by the collapse option using the urlkey field",
+)
+
+query = st.button("Query", type="primary", use_container_width=True)
+
+# Tweet Listing Settings
+
+
+if username != st.session_state.current_username:
+ st.session_state.current_username = username
+ st.session_state.offset = 0
+
+if query or st.session_state.count:
+ tweets_per_page = 25
+
+ st.session_state.count = tweets_count(
+ username, st.session_state.archived_timestamp_filter
+ )
+
+ st.caption(
+ "The search optimization uses an 8-digit [collapsing strategy](https://github.com/internetarchive/wayback/blob/master/wayback-cdx-server/README.md?ref=hackernoon.com#collapsing), refining the captures to one per day. The number of tweets per page is set to 25, and this is a fixed value due to the API rate limit." # noqa: E501
+ )
+ st.write(f"**{st.session_state.count} URLs have been captured**")
+
+ if st.session_state.count:
+ if tweets_per_page > st.session_state.count:
+ tweets_per_page = st.session_state.count
+
+ try:
+ progress = st.empty()
+
+ # Tweet Listing Processing
+
+ response = WaybackTweets(
+ username,
+ unique,
+ st.session_state.archived_timestamp_filter[0],
+ st.session_state.archived_timestamp_filter[1],
+ tweets_per_page,
+ )
+ archived_tweets = response.get()
+
+ with st.spinner("Parsing..."):
+ if archived_tweets:
+ field_options = [
+ "archived_urlkey",
+ "archived_timestamp",
+ "original_tweet_url",
+ "archived_tweet_url",
+ "parsed_tweet_url",
+ "parsed_archived_tweet_url",
+ "parsed_tweet_text_mimetype_json",
+ "available_tweet_text",
+ "available_tweet_is_RT",
+ "available_tweet_info",
+ "archived_mimetype",
+ "archived_statuscode",
+ ]
+
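+                    # Parse the CDX rows into the selected fields and build a DataFrame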
+ parser = TweetsParser(archived_tweets, username, field_options)
+ parsed_tweets = parser.parse()
+
+ exporter = TweetsExporter(parsed_tweets, username, field_options)
+ df = exporter.dataframe
+
+ # file_path = "claromes_tweets_20240610210338.csv"
+ # df = pd.read_csv(file_path)
+ # df = df.fillna("")
+
+ archived_urlkey = df["archived_urlkey"]
+ archived_timestamp = df["archived_timestamp"]
+ original_tweet_url = df["original_tweet_url"]
+ archived_tweet_url = df["archived_tweet_url"]
+ parsed_tweet_url = df["parsed_tweet_url"]
+ parsed_archived_tweet_url = df["parsed_archived_tweet_url"]
+ parsed_tweet_text_mimetype_json = df["parsed_tweet_text_mimetype_json"]
+ available_tweet_text = df["available_tweet_text"]
+ available_tweet_is_RT = df["available_tweet_is_RT"]
+ available_tweet_info = df["available_tweet_info"]
+ archived_mimetype = df["archived_mimetype"]
+ archived_statuscode = df["archived_statuscode"]
+
+ st.divider()
+
+ st.session_state.current_username = username
+
+ return_none_count = 0
+
+ start_index = st.session_state.offset
+ end_index = min(st.session_state.count, start_index + tweets_per_page)
+
+ for i in range(tweets_per_page):
+ try:
+ if original_tweet_url[i]:
+
+ # Display all tweets
+ if not not_available:
+ st.markdown(
+ f'{i+1 + st.session_state.offset}. [**archived url ↗**]({archived_tweet_url[i]}) · [**tweet url ↗**]({original_tweet_url[i]}) · **MIME type:** {archived_mimetype[i]} · **archived timestamp:** {datetime.datetime.strptime(str(archived_timestamp[i]), "%Y%m%d%H%M%S")} · **archived status code:** {archived_statuscode[i]}' # noqa: E501
+ )
+
+ # Display available tweets
+ if available_tweet_text[i]:
+ if available_tweet_is_RT[i]:
+ st.info("*Retweet*")
+
+ st.write(available_tweet_text[i])
+ st.write(f"**{available_tweet_info[i]}**")
+
+ st.divider()
+
+ # Display tweets not available with text/html, unk, warc/revisit MIME type or application/json MIME type without parsed JSON text # noqa: E501
+ elif (
+ (
+ archived_mimetype[i] != "application/json"
+ and not parsed_tweet_text_mimetype_json[i]
+ )
+ and not available_tweet_text[i]
+ ) or (
+ (
+ archived_mimetype[i] == "application/json"
+ and not parsed_tweet_text_mimetype_json[i]
+ )
+ and not available_tweet_text[i]
+ ):
+ if (
+ ".jpg" in original_tweet_url[i]
+ or ".png" in original_tweet_url[i]
+ ) and (400 <= archived_statuscode[i] <= 511):
+ components.iframe(
+ archived_tweet_url[i],
+ height=500,
+ scrolling=True,
+ )
+ elif "/status/" not in original_tweet_url[i]:
+ st.info(
+                                                "This isn't a status, or it is not available"
+ )
+ elif (
+ check_double_status(
+ archived_tweet_url[i], original_tweet_url[i]
+ )
+ or f"{st.session_state.current_username}"
+ not in original_tweet_url[i]
+ ):
+ st.info(
+ f"Replying to {st.session_state.current_username}" # noqa: E501
+ )
+ else:
+ components.iframe(
+ archived_tweet_url[i],
+ height=500,
+ scrolling=True,
+ )
+
+ st.divider()
+
+ # Display tweets not available with application/json MIME type and parsed JSON text # noqa: E501
+ elif (
+ archived_mimetype[i] == "application/json"
+ and parsed_tweet_text_mimetype_json[i]
+ ) and not available_tweet_text[i]:
+ st.code(parsed_tweet_text_mimetype_json[i])
+ # st.json(json_data, expanded=False)
+
+ st.divider()
+
+ # Display only tweets not available
+ if not_available:
+ return_none_count += 1
+
+ st.markdown(
+ f'{i+1 + st.session_state.offset}. [**archived url ↗**]({archived_tweet_url[i]}) · [**tweet url ↗**]({original_tweet_url[i]}) · **MIME type:** {archived_mimetype[i]} · **archived timestamp:** {datetime.datetime.strptime(str(archived_timestamp[i]), "%Y%m%d%H%M%S")} · **archived status code:** {archived_statuscode[i]}' # noqa: E501
+ )
+
+ # Display tweets not available with text/html, unk, warc/revisit return # noqa: E501
+ if (
+ archived_mimetype[i] != "application/json"
+ and not available_tweet_text[i]
+ ):
+ if (
+ ".jpg" in original_tweet_url[i]
+ or ".png" in original_tweet_url[i]
+ ) and (400 <= archived_statuscode[i] <= 511):
+ components.iframe(
+ archived_tweet_url[i],
+ height=500,
+ scrolling=True,
+ )
+ elif "/status/" not in original_tweet_url[i]:
+ st.info(
+                                                "This isn't a status, or it is not available"
+ )
+ elif (
+ check_double_status(
+ archived_tweet_url[i], original_tweet_url[i]
+ )
+ or f"{st.session_state.current_username}"
+ not in original_tweet_url[i]
+ ):
+ st.info(
+ f"Replying to {st.session_state.current_username}" # noqa: E501
+ )
+ else:
+ components.iframe(
+ archived_tweet_url[i],
+ height=500,
+ scrolling=True,
+ )
+
+ st.divider()
+
+ # Display tweets not available with application/json return # noqa: E501
+ elif (
+ archived_mimetype[i] == "application/json"
+ and not available_tweet_text[i]
+ ):
+ st.code(parsed_tweet_text_mimetype_json[i])
+ # st.json(json_data, expanded=False)
+
+ st.divider()
+
+ progress.write(
+ f"{return_none_count} URLs have been captured in the range {start_index}-{end_index}" # noqa: E501
+ )
+ except IndexError:
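+                            # Fewer rows than a full page were returned: the last page is reached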
+ if start_index <= 0:
+ st.session_state.prev_disabled = True
+ else:
+ st.session_state.prev_disabled = False
+
+ st.session_state.next_disabled = True
+
+            prev_col, _, next_col = st.columns([3, 4, 3])
+
+            prev_col.button(
+                "Previous",
+                disabled=st.session_state.prev_disabled,
+                key="prev_button_key",
+                on_click=prev_page,
+                type="primary",
+                use_container_width=True,
+            )
+            next_col.button(
+                "Next",
+                disabled=st.session_state.next_disabled,
+                key="next_button_key",
+                on_click=next_page,
+                type="primary",
+                use_container_width=True,
+            )
+
+ if not archived_tweets:
+ st.error("Unable to query the Wayback Machine API.")
+ except TypeError as e:
+ st.error(
+ f"""
+ {e}. Refresh this page and try again.
+
+ If the problem persists [open an issue](https://github.com/claromes/waybacktweets/issues).""" # noqa: E501
+ )
+ st.session_state.offset = 0
+++ /dev/null
-import datetime
-
-import requests
-import streamlit as st
-import streamlit.components.v1 as components
-
-from waybacktweets.export_tweets import TweetsExporter
-from waybacktweets.parse_tweets import TweetsParser
-from waybacktweets.request_tweets import WaybackTweets
-from waybacktweets.utils import check_double_status
-
-# Initial Settings
-
-LOGO = "app/assets/parthenon.svg"
-
-st.set_page_config(
- page_title="Wayback Tweets",
- page_icon=LOGO,
- layout="centered",
- menu_items={
- "Report a bug": "https://github.com/claromes/waybacktweets/issues",
- },
-)
-
-# https://discuss.streamlit.io/t/remove-hide-running-man-animation-on-top-of-page/21773/3
-st.html(
- """
-<style>
- header[data-testid="stHeader"] {
- opacity: 0.5;
- }
- iframe {
- border: 1px solid #dddddd;
- border-radius: 0.5rem;
- }
- div[data-testid="InputInstructions"] {
- visibility: hidden;
- }
- img[data-testid="stLogo"] {
- scale: 3;
- padding-left: 10px;
- }
-</style>
-"""
-)
-
-if "current_username" not in st.session_state:
- st.session_state.current_username = ""
-
-if "prev_disabled" not in st.session_state:
- st.session_state.prev_disabled = False
-
-if "next_disabled" not in st.session_state:
- st.session_state.next_disabled = False
-
-if "next_button" not in st.session_state:
- st.session_state.next_button = False
-
-if "prev_button" not in st.session_state:
- st.session_state.prev_button = False
-
-if "update_component" not in st.session_state:
- st.session_state.update_component = 0
-
-if "offset" not in st.session_state:
- st.session_state.offset = 0
-
-if "count" not in st.session_state:
- st.session_state.count = False
-
-start_date = datetime.datetime(2006, 3, 1)
-end_date = datetime.datetime.now()
-
-if "archived_timestamp_filter" not in st.session_state:
- st.session_state.archived_timestamp_filter = (start_date, end_date)
-
-
-# Pagination Settings
-
-
-def scroll_into_view():
- script = f"""
- <script>
- window.parent.document.querySelector('section.main').scrollTo(0, 0);
- let update_component = {st.session_state.update_component}
- </script>
- """
-
- components.html(script, width=0, height=0)
-
-
-def prev_page():
- st.session_state.offset -= tweets_per_page
-
- st.session_state.update_component += 1
- scroll_into_view()
-
-
-def next_page():
- st.session_state.offset += tweets_per_page
-
- st.session_state.update_component += 1
- scroll_into_view()
-
-
-# Requesting
-
-
-@st.cache_data(ttl=1800, show_spinner=False)
-def tweets_count(username, archived_timestamp_filter):
- url = f"https://web.archive.org/cdx/search/cdx?url=https://twitter.com/{username}/status/*&collapse=timestamp:8&output=json&from={archived_timestamp_filter[0]}&to={archived_timestamp_filter[1]}" # noqa: E501
-
- try:
- response = requests.get(url)
-
- if response.status_code == 200:
- data = response.json()
- if data and len(data) > 1:
- total_tweets = len(data) - 1
- return total_tweets
- else:
- return 0
- except requests.exceptions.Timeout:
- st.error("Connection to web.archive.org timed out.")
- st.stop()
- except requests.exceptions.ConnectionError:
- st.error("Failed to establish a new connection with web.archive.org.")
- st.stop()
- except UnboundLocalError:
- st.empty()
-
-
-# Interface Settings
-
-
-st.logo(LOGO)
-
-st.success(
- """**New Feature: CLI**
-
-You can now retrieve archived tweets using the Wayback Tweets command line tool.
-Download the archived tweets' CDX data in CSV, JSON, and HTML formats.
-
-For more details, [read the documentation](https://github.com/claromes/waybacktweets).""" # noqa: E501
-)
-
-st.title(
- "Wayback Tweets",
- anchor=False,
-)
-st.caption(
- "[](https://github.com/claromes/waybacktweets/releases) [](https://github.com/claromes/waybacktweets)" # noqa: E501
-)
-st.caption("Display multiple archived tweets on Wayback Machine")
-
-username = st.text_input("Username", placeholder="Without @")
-
-start_date = datetime.datetime(2006, 3, 1)
-end_date = datetime.datetime.now()
-
-st.session_state.archived_timestamp_filter = st.date_input(
- "Tweets saved between",
- (start_date, end_date),
- start_date,
- end_date,
- format="YYYY/MM/DD",
- help="YYYY/MM/DD",
-)
-
-not_available = st.checkbox("Only tweets not available")
-
-unique = st.checkbox(
- "Only unique URLs",
- help="Filtering by the collapse option using the urlkey field",
-)
-
-query = st.button("Query", type="primary", use_container_width=True)
-
-# Tweet Listing Settings
-
-
-if username != st.session_state.current_username:
- st.session_state.current_username = username
- st.session_state.offset = 0
-
-if query or st.session_state.count:
- tweets_per_page = 25
-
- st.session_state.count = tweets_count(
- username, st.session_state.archived_timestamp_filter
- )
-
- st.caption(
- "The search optimization uses an 8-digit [collapsing strategy](https://github.com/internetarchive/wayback/blob/master/wayback-cdx-server/README.md?ref=hackernoon.com#collapsing), refining the captures to one per day. The number of tweets per page is set to 25, and this is a fixed value due to the API rate limit." # noqa: E501
- )
- st.write(f"**{st.session_state.count} URLs have been captured**")
-
- if st.session_state.count:
- if tweets_per_page > st.session_state.count:
- tweets_per_page = st.session_state.count
-
- try:
- progress = st.empty()
-
- # Tweet Listing Processing
-
- response = WaybackTweets(
- username,
- unique,
- st.session_state.archived_timestamp_filter[0],
- st.session_state.archived_timestamp_filter[1],
- tweets_per_page,
- )
- archived_tweets = response.get()
-
- with st.spinner("Parsing..."):
- if archived_tweets:
- field_options = [
- "archived_urlkey",
- "archived_timestamp",
- "original_tweet_url",
- "archived_tweet_url",
- "parsed_tweet_url",
- "parsed_archived_tweet_url",
- "parsed_tweet_text_mimetype_json",
- "available_tweet_text",
- "available_tweet_is_RT",
- "available_tweet_info",
- "archived_mimetype",
- "archived_statuscode",
- ]
-
- parser = TweetsParser(archived_tweets, username, field_options)
- parsed_tweets = parser.parse()
-
- exporter = TweetsExporter(parsed_tweets, username, field_options)
- df = exporter.dataframe
-
- # file_path = "claromes_tweets_20240610210338.csv"
- # df = pd.read_csv(file_path)
- # df = df.fillna("")
-
- archived_urlkey = df["archived_urlkey"]
- archived_timestamp = df["archived_timestamp"]
- original_tweet_url = df["original_tweet_url"]
- archived_tweet_url = df["archived_tweet_url"]
- parsed_tweet_url = df["parsed_tweet_url"]
- parsed_archived_tweet_url = df["parsed_archived_tweet_url"]
- parsed_tweet_text_mimetype_json = df["parsed_tweet_text_mimetype_json"]
- available_tweet_text = df["available_tweet_text"]
- available_tweet_is_RT = df["available_tweet_is_RT"]
- available_tweet_info = df["available_tweet_info"]
- archived_mimetype = df["archived_mimetype"]
- archived_statuscode = df["archived_statuscode"]
-
- st.divider()
-
- st.session_state.current_username = username
-
- return_none_count = 0
-
- start_index = st.session_state.offset
- end_index = min(st.session_state.count, start_index + tweets_per_page)
-
- for i in range(tweets_per_page):
- try:
- if original_tweet_url[i]:
-
- # Display all tweets
- if not not_available:
- st.markdown(
- f'{i+1 + st.session_state.offset}. [**archived url ↗**]({archived_tweet_url[i]}) · [**tweet url ↗**]({original_tweet_url[i]}) · **MIME type:** {archived_mimetype[i]} · **archived timestamp:** {datetime.datetime.strptime(str(archived_timestamp[i]), "%Y%m%d%H%M%S")} · **archived status code:** {archived_statuscode[i]}' # noqa: E501
- )
-
- # Display available tweets
- if available_tweet_text[i]:
- if available_tweet_is_RT[i]:
- st.info("*Retweet*")
-
- st.write(available_tweet_text[i])
- st.write(f"**{available_tweet_info[i]}**")
-
- st.divider()
-
- # Display tweets not available with text/html, unk, warc/revisit return # noqa: E501
- elif (
- archived_mimetype[i] != "application/json"
- and not available_tweet_text[i]
- ):
- if (
- ".jpg" in original_tweet_url[i]
- or ".png" in original_tweet_url[i]
- ) and (400 <= archived_statuscode[i] <= 511):
- components.iframe(
- archived_tweet_url[i],
- height=500,
- scrolling=True,
- )
- elif "/status/" not in original_tweet_url[i]:
- st.info(
- "This isn't a status or is not available"
- )
- elif (
- check_double_status(
- archived_tweet_url[i], original_tweet_url[i]
- )
- or f"{st.session_state.current_username}"
- not in original_tweet_url[i]
- ):
- st.info(
- f"Replying to {st.session_state.current_username}" # noqa: E501
- )
- else:
- components.iframe(
- archived_tweet_url[i],
- height=500,
- scrolling=True,
- )
-
- st.divider()
-
- # Display tweets not available with application/json return # noqa: E501
- elif (
- archived_mimetype[i] == "application/json"
- and not available_tweet_text[i]
- ):
- st.code(parsed_tweet_text_mimetype_json[i])
- # st.json(json_data, expanded=False)
-
- st.divider()
-
- # Display only tweets not available
- if not_available:
- return_none_count += 1
-
- st.markdown(
- f'{i+1 + st.session_state.offset}. [**archived url ↗**]({archived_tweet_url[i]}) · [**tweet url ↗**]({original_tweet_url[i]}) · **MIME type:** {archived_mimetype[i]} · **archived timestamp:** {datetime.datetime.strptime(str(archived_timestamp[i]), "%Y%m%d%H%M%S")} · **archived status code:** {archived_statuscode[i]}' # noqa: E501
- )
-
- # Display tweets not available with text/html, unk, warc/revisit return # noqa: E501
- if (
- archived_mimetype[i] != "application/json"
- and not available_tweet_text[i]
- ):
- if (
- ".jpg" in original_tweet_url[i]
- or ".png" in original_tweet_url[i]
- ) and (400 <= archived_statuscode[i] <= 511):
- components.iframe(
- archived_tweet_url[i],
- height=500,
- scrolling=True,
- )
- elif "/status/" not in original_tweet_url[i]:
- st.info(
- "This isn't a status or is not available"
- )
- elif (
- check_double_status(
- archived_tweet_url[i], original_tweet_url[i]
- )
- or f"{st.session_state.current_username}"
- not in original_tweet_url[i]
- ):
- st.info(
- f"Replying to {st.session_state.current_username}" # noqa: E501
- )
- else:
- components.iframe(
- archived_tweet_url[i],
- height=500,
- scrolling=True,
- )
-
- st.divider()
-
- # Display tweets not available with application/json return # noqa: E501
- elif (
- archived_mimetype[i] == "application/json"
- and not available_tweet_text[i]
- ):
- st.code(parsed_tweet_text_mimetype_json[i])
- # st.json(json_data, expanded=False)
-
- st.divider()
-
- progress.write(
- f"{return_none_count} URLs have been captured in the range {start_index}-{end_index}" # noqa: E501
- )
- pass
- except IndexError:
- if start_index <= 0:
- st.session_state.prev_disabled = True
- else:
- st.session_state.prev_disabled = False
-
- st.session_state.next_disabled = True
-
- prev, _, next = st.columns([3, 4, 3])
-
- prev.button(
- "Previous",
- disabled=st.session_state.prev_disabled,
- key="prev_button_key",
- on_click=prev_page,
- type="primary",
- use_container_width=True,
- )
- next.button(
- "Next",
- disabled=st.session_state.next_disabled,
- key="next_button_key",
- on_click=next_page,
- type="primary",
- use_container_width=True,
- )
-
- if not archived_tweets:
- st.error("Unable to query the Wayback Machine API.")
- except TypeError as e:
- st.error(
- f"""
- {e}. Refresh this page and try again.
-
- If the problem persists [open an issue](https://github.com/claromes/waybacktweets/issues).""" # noqa: E501
- )
- st.session_state.offset = 0
from datetime import datetime
import click
+from requests import exceptions
from rich import print as rprint
from waybacktweets.export_tweets import TweetsExporter
exporter.save_to_json()
exporter.save_to_html()
- except TypeError as e:
+    except exceptions.RequestException as e:
rprint(f"[red]{e}")
finally:
rprint(
import re
-import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import unquote
-import requests
+from requests import exceptions
from rich import print as rprint
from rich.progress import Progress
check_pattern_tweet,
clean_tweet_url,
delete_tweet_pathnames,
+ get_response,
semicolon_parser,
)
"""Parses the archived tweets when they are still available."""
try:
url = f"https://publish.twitter.com/oembed?url={self.tweet_url}"
- response = requests.get(url)
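+        # Use the shared get_response helper (retries, custom User-Agent) instead of requests.get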
+ response = get_response(url=url)
if response:
json_response = response.json()
is_RT.append(author_name != author_tweet)
return tweet_content, is_RT, user_info
- except Exception:
+        except exceptions.RequestException:
rprint("[yellow]Error parsing the tweet, but the CDX data was saved.")
return None
def parse(self):
"""Parses the archived tweets in JSON format."""
-
- max_attempts = 5
try:
- for attempt in range(max_attempts):
- try:
- response = requests.get(self.archived_tweet_url)
- break
- except requests.exceptions.ConnectionError:
- if attempt < max_attempts - 1:
- time.sleep(0.5)
- else:
- raise
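+            # Connection retries are handled inside get_response, replacing the manual retry loop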
+ response = get_response(url=self.archived_tweet_url)
if response:
json_data = response.json()
)
return json_data.get("text", json_data)
- except Exception:
+ except exceptions.ConnectionError:
rprint(
- f"[yellow]Connection error with {self.archived_tweet_url}. Error parsing the JSON, but the CDX data was saved." # noqa: E501
+ f"[yellow]Connection error with {self.archived_tweet_url}. Max retries exceeded. Error parsing the JSON, but the CDX data was saved." # noqa: E501
)
+ return ""
+    except exceptions.RequestException:
+ rprint("[yellow]Error parsing the JSON, but the CDX data was saved.")
return ""
try:
future.result()
except Exception as e:
- rprint(f"[red]{e}")
+ rprint(f"[red]{e}...")
progress.update(task, advance=1)
-import requests
+from requests import exceptions
from rich import print as rprint
+from waybacktweets.utils import get_response
+
class WaybackTweets:
"""Requests data from the Wayback CDX Server API and returns it in JSON format."""
print("Making a request to the Internet Archive...")
try:
- response = requests.get(url, params=params)
+ response = get_response(url=url, params=params)
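+        # get_response returns None on HTTP errors, so guard before parsing JSON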
if response:
return response.json()
- except requests.exceptions.ReadTimeout:
+ except exceptions.ReadTimeout:
rprint("[red]Connection to web.archive.org timed out.")
- except requests.exceptions.ConnectionError:
- rprint("[red]Failed to establish a new connection with web.archive.org.")
- except requests.exceptions.HTTPError:
+ except exceptions.ConnectionError:
+ rprint(
+ "[red]Failed to establish a new connection with web.archive.org. Max retries exceeded." # noqa: E501
+ )
+ except exceptions.HTTPError:
rprint(
"[red]Temporarily Offline: Internet Archive services are temporarily offline. Please check Internet Archive Twitter feed (https://twitter.com/internetarchive) for the latest information." # noqa: E501
)
import re
+import requests
+from requests.adapters import HTTPAdapter
+from urllib3.util.retry import Retry
+
+
+def get_response(url, params=None):
+ """Sends a GET request to the specified URL and returns the response."""
+ session = requests.Session()
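+    # Retry failed connection attempts up to 3 times with exponential backoff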
+ retry = Retry(connect=3, backoff_factor=0.3)
+ adapter = HTTPAdapter(max_retries=retry)
+ headers = {
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36" # noqa: E501
+ }
+
+ session.mount("http://", adapter)
+ session.mount("https://", adapter)
+
+ response = session.get(url, params=params, headers=headers)
+
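+    # Return the response only for non-error status codes; HTTP 4xx/5xx yields None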
+ if not 400 <= response.status_code <= 511:
+ return response
+
def clean_tweet_url(tweet_url, username):
"""
def generate(self):
"""Generates an HTML file."""
+
html = f"<html>\n<head>\n<title>@{self.username} archived tweets</title>\n"
html += "<style>\n"
html += "body { font-family: monospace; background-color: #f5f8fa; color: #1c1e21; margin: 0; padding: 20px; }\n"
html += '<div class="tweet">\n'
if (
- tweet["archived_mimetype"] != "application/json"
+ (
+ tweet["archived_mimetype"] != "application/json"
+ and not tweet["parsed_tweet_text_mimetype_json"]
+ )
+ and not tweet["available_tweet_text"]
+ ) or (
+ (
+ tweet["archived_mimetype"] == "application/json"
+ and not tweet["parsed_tweet_text_mimetype_json"]
+ )
and not tweet["available_tweet_text"]
):
html += f'<iframe src="{tweet["parsed_archived_tweet_url"]}" frameborder="0" scrolling="auto"></iframe>\n'
html += f'<p><strong class="content">Available Tweet Is Retweet:</strong> {tweet["available_tweet_is_RT"]}</p>\n'
html += f'<p><strong class="content">Available Tweet Username:</strong> {tweet["available_tweet_info"]}</p>\n'
- if tweet["archived_mimetype"] == "application/json":
+ if (
+ tweet["archived_mimetype"] == "application/json"
+ and tweet["parsed_tweet_text_mimetype_json"]
+ ) and not tweet["available_tweet_text"]:
html += f'<p><strong class="content">Parsed Tweet Text (application/json):</strong> {tweet["parsed_tweet_text_mimetype_json"]}</p>\n'
html += "<br>\n"