From bcf26caf880ed7c1be894be90c7d3044df758870 Mon Sep 17 00:00:00 2001 From: Claromes Date: Tue, 11 Jun 2024 18:49:54 -0300 Subject: [PATCH] update streamlit app --- .gitignore | 3 +- .streamlit/config.toml | 11 - app.py | 520 -------------------------------- app/.streamlit/config.toml | 11 + app/assets/parthenon.svg | 26 ++ app/new_app.py | 427 ++++++++++++++++++++++++++ docs/index.md | 15 + mkdocs.yml | 5 + poetry.lock | 215 ++++++------- pyproject.toml | 2 +- waybacktweets/parse_tweets.py | 31 +- waybacktweets/request_tweets.py | 12 +- 12 files changed, 607 insertions(+), 671 deletions(-) delete mode 100644 .streamlit/config.toml delete mode 100644 app.py create mode 100644 app/.streamlit/config.toml create mode 100644 app/assets/parthenon.svg create mode 100644 app/new_app.py diff --git a/.gitignore b/.gitignore index a18b689..3f6005e 100644 --- a/.gitignore +++ b/.gitignore @@ -2,5 +2,4 @@ *.json *.html waybacktweets/__pycache__ -waybacktweets/notes.md -.vscode +notes.md diff --git a/.streamlit/config.toml b/.streamlit/config.toml deleted file mode 100644 index 2f7df50..0000000 --- a/.streamlit/config.toml +++ /dev/null @@ -1,11 +0,0 @@ -[theme] -base="light" -primaryColor="#ab2e33" -secondaryBackgroundColor="#efefef" -textColor="#000000" -backgroundColor="#f9f9f9" -font="serif" - -[client] -displayEnabled=true -toolbarMode="minimal" diff --git a/app.py b/app.py deleted file mode 100644 index f366ee4..0000000 --- a/app.py +++ /dev/null @@ -1,520 +0,0 @@ -import requests -import datetime -import streamlit as st -import streamlit.components.v1 as components -import json -import re -from urllib.parse import unquote - -year = datetime.datetime.now().year - -st.set_page_config(page_title='Wayback Tweets', - page_icon='🏛️', - layout='centered', - menu_items={ - 'About': - ''' - ## 🏛️ Wayback Tweets - - [![GitHub release (latest by date including pre-releases)](https://img.shields.io/github/v/release/claromes/waybacktweets?include_prereleases)](https://github.com/claromes/waybacktweets/releases) [![License](https://img.shields.io/github/license/claromes/waybacktweets)](https://github.com/claromes/waybacktweets/blob/main/LICENSE.md) - - Tool that displays, via Wayback CDX Server API, multiple archived tweets on Wayback Machine to avoid opening each link manually. Users can apply filters based on specific years and view tweets that do not have the original URL available. - - This tool is a prototype, please feel free to send your [feedbacks](https://github.com/claromes/waybacktweets/issues). Created and maintained by [@claromes](https://github.com/claromes). 
- - ------- - ''', - 'Report a bug': - 'https://github.com/claromes/waybacktweets/issues' - }) - -# https://discuss.streamlit.io/t/remove-hide-running-man-animation-on-top-of-page/21773/3 -hide_streamlit_style = ''' - -''' - -st.markdown(hide_streamlit_style, unsafe_allow_html=True) - -if 'current_handle' not in st.session_state: - st.session_state.current_handle = '' - -if 'prev_disabled' not in st.session_state: - st.session_state.prev_disabled = False - -if 'next_disabled' not in st.session_state: - st.session_state.next_disabled = False - -if 'next_button' not in st.session_state: - st.session_state.next_button = False - -if 'prev_button' not in st.session_state: - st.session_state.prev_button = False - -if 'update_component' not in st.session_state: - st.session_state.update_component = 0 - -if 'offset' not in st.session_state: - st.session_state.offset = 0 - -if 'saved_at' not in st.session_state: - st.session_state.saved_at = (2006, year) - -if 'count' not in st.session_state: - st.session_state.count = False - - -def scroll_into_view(): - js = f''' - - ''' - - components.html(js, width=0, height=0) - - -def clean_tweet(tweet): - handle = st.session_state.current_handle.lower() - tweet_lower = tweet.lower() - - pattern = re.compile(r'/status/(\d+)') - match_lower_case = pattern.search(tweet_lower) - match_original_case = pattern.search(tweet) - - if match_lower_case and handle in tweet_lower: - return f'https://twitter.com/{st.session_state.current_handle}/status/{match_original_case.group(1)}' - else: - return tweet - - -def clean_link(link): - handle = st.session_state.current_handle.lower() - link = link.lower() - - pattern = re.compile(r'/status/(\d+)') - match = pattern.search(link) - - if match and handle in link: - return f'https://web.archive.org/web/{timestamp[i]}/https://twitter.com/{st.session_state.current_handle}/status/{match.group(1)}' - else: - return link - - -def pattern_tweet(tweet): - # Reply: /status// - # Link: /status/// - # Twimg: /status/https://pbs - - pattern = re.compile(r'/status/"([^"]+)"') - - match = pattern.search(tweet) - if match: - return match.group(1).lstrip('/') - else: - return tweet - - -def pattern_tweet_id(tweet): - # Delete sub-endpoint (/photos, /likes, /retweet...) - pattern_username = re.compile(r'https://twitter\.com/([^/]+)/status/\d+') - match_username = pattern_username.match(tweet) - - pattern_id = r'https://twitter.com/\w+/status/(\d+)' - match_id = re.search(pattern_id, tweet) - - if match_id and match_username: - tweet_id = match_id.group(1) - username = match_username.group(1) - return f'https://twitter.com/{username}/status/{tweet_id}' - else: - return tweet - - -def check_double_status(url_wb, url_tweet): - if url_wb.count('/status/') == 2 and not 'twitter.com' in url_tweet: - return True - - return False - - -def embed(tweet): - try: - url = f'https://publish.twitter.com/oembed?url={clean_tweet(tweet)}' - response = requests.get(url) - - regex = r'
]+)?>]*>(.*?)<\/p>.*?— (.*?)<\/a>' - regex_author = r'^(.*?)\s*\(' - - if response.status_code == 200 or response.status_code == 302: - status_code = response.status_code - html = response.json()['html'] - author_name = response.json()['author_name'] - - matches_html = re.findall(regex, html, re.DOTALL) - - tweet_content = [] - user_info = [] - is_RT = [] - - for match in matches_html: - tweet_content_match = re.sub(r']*>|<\/a>', '', - match[0].strip()) - tweet_content_match = tweet_content_match.replace('
', '\n') - - user_info_match = re.sub(r']*>|<\/a>', '', - match[1].strip()) - user_info_match = user_info_match.replace(')', '), ') - - match_author = re.search(regex_author, user_info_match) - author_tweet = match_author.group(1) - - if tweet_content_match: - tweet_content.append(tweet_content_match) - if user_info_match: - user_info.append(user_info_match) - - is_RT_match = False - if author_name != author_tweet: - is_RT_match = True - - is_RT.append(is_RT_match) - - return status_code, tweet_content, user_info, is_RT - else: - return False - except requests.exceptions.Timeout: - st.error('Connection to web.archive.org timed out.') - except requests.exceptions.ConnectionError: - st.error('Failed to establish a new connection with web.archive.org.') - except UnboundLocalError: - st.empty() - - -@st.cache_data(ttl=1800, show_spinner=False) -def tweets_count(handle, saved_at): - url = f'https://web.archive.org/cdx/search/cdx?url=https://twitter.com/{handle}/status/*&collapse=timestamp:8&output=json&from={saved_at[0]}&to={saved_at[1]}' - try: - response = requests.get(url) - - if response.status_code == 200: - data = response.json() - if data and len(data) > 1: - total_tweets = len(data) - 1 - return total_tweets - else: - return 0 - except requests.exceptions.Timeout: - st.error('Connection to web.archive.org timed out.') - st.stop() - except requests.exceptions.ConnectionError: - st.error('Failed to establish a new connection with web.archive.org.') - except UnboundLocalError: - st.empty() - - -@st.cache_data(ttl=1800, show_spinner=False) -def query_api(handle, limit, offset, saved_at): - if not handle: - st.warning('username, please!') - st.stop() - - url = f'https://web.archive.org/cdx/search/cdx?url=https://twitter.com/{handle}/status/*&collapse=timestamp:8&output=json&limit={limit}&offset={offset}&from={saved_at[0]}&to={saved_at[1]}' - try: - response = requests.get(url) - response.raise_for_status() - - if response.status_code == 200 or response.status_code == 304: - return response.json() - except requests.exceptions.Timeout: - st.error('Connection to web.archive.org timed out.') - except requests.exceptions.ConnectionError: - st.error('Failed to establish a new connection with web.archive.org.') - except UnboundLocalError: - st.empty() - except requests.exceptions.HTTPError: - st.error(''' - **Temporarily Offline** - - Internet Archive services are temporarily offline. Please check Internet Archive [Twitter feed](https://twitter.com/internetarchive/) for the latest information. - ''') - st.stop() - - -@st.cache_data(ttl=1800, show_spinner=False) -def parse_links(links): - parsed_links = [] - timestamp = [] - tweet_links = [] - parsed_mimetype = [] - - for link in links[1:]: - tweet_remove_char = unquote(link[2]).replace('’', '') - cleaned_tweet = pattern_tweet(tweet_remove_char).strip('"') - - url = f'https://web.archive.org/web/{link[1]}/{tweet_remove_char}' - - parsed_links.append(url) - timestamp.append(link[1]) - tweet_links.append(cleaned_tweet) - parsed_mimetype.append(link[3]) - - return parsed_links, tweet_links, parsed_mimetype, timestamp - - -def attr(i): - original_tweet = pattern_tweet_id(clean_tweet(tweet_links[i])) - - if status: - original_tweet = pattern_tweet_id( - f'https://twitter.com/{tweet_links[i]}') - elif not '://' in tweet_links[i]: - original_tweet = pattern_tweet_id(f'https://{tweet_links[i]}') - - st.markdown( - f'{i+1 + st.session_state.offset}. 
[**archived url**]({link}) · [**original url**]({original_tweet}) · **MIME Type:** {mimetype[i]} · **Saved at:** {datetime.datetime.strptime(timestamp[i], "%Y%m%d%H%M%S")}' - ) - - -def display_tweet(): - if mimetype[i] == 'application/json' or mimetype[ - i] == 'text/html' or mimetype[i] == 'unk' or mimetype[ - i] == 'warc/revisit': - if is_RT[0] == True: - st.info('*Retweet*') - st.write(tweet_content[0]) - st.write(f'**{user_info[0]}**') - - st.divider() - else: - st.warning('MIME Type was not parsed.') - - st.divider() - - -def display_not_tweet(): - original_link = pattern_tweet_id(clean_tweet(tweet_links[i])) - - if status: - original_link = pattern_tweet_id( - f'https://twitter.com/{tweet_links[i]}') - elif not '://' in tweet_links[i]: - original_link = pattern_tweet_id(f'https://{tweet_links[i]}') - - response_html = requests.get(original_link) - - if mimetype[i] == 'text/html' or mimetype[i] == 'warc/revisit' or mimetype[ - i] == 'unk': - if ('.jpg' in tweet_links[i] or '.png' - in tweet_links[i]) and response_html.status_code == 200: - components.iframe(tweet_links[i], height=500, scrolling=True) - elif '/status/' not in original_link: - st.info("This isn't a status or is not available") - elif status or f'{st.session_state.current_handle}' not in original_link: - st.info(f'Replying to {st.session_state.current_handle}') - else: - components.iframe(clean_link(link), height=500, scrolling=True) - - st.divider() - elif mimetype[i] == 'application/json': - try: - response_json = requests.get(link) - - if response_json.status_code == 200: - json_data = response_json.json() - - if 'data' in json_data: - if 'text' in json_data['data']: - json_text = json_data['data']['text'] - else: - json_text = json_data['data'] - else: - if 'text' in json_data: - json_text = json_data['text'] - else: - json_text = json_data - - st.code(json_text) - st.json(json_data, expanded=False) - - st.divider() - else: - st.error(response_json.status_code) - - st.divider() - except requests.exceptions.Timeout: - st.error('Connection to web.archive.org timed out.') - st.divider() - except requests.exceptions.ConnectionError: - st.error( - 'Failed to establish a new connection with web.archive.org.') - st.divider() - except UnboundLocalError: - st.empty() - else: - st.warning('MIME Type was not parsed.') - st.divider() - - -def prev_page(): - st.session_state.offset -= tweets_per_page - - #scroll to top config - st.session_state.update_component += 1 - scroll_into_view() - - -def next_page(): - st.session_state.offset += tweets_per_page - - #scroll to top config - st.session_state.update_component += 1 - scroll_into_view() - - -# UI -st.title( - 'Wayback Tweets [![GitHub release (latest by date including pre-releases)](https://img.shields.io/github/v/release/claromes/waybacktweets?include_prereleases)](https://github.com/claromes/waybacktweets/releases) [![Star](https://img.shields.io/github/stars/claromes/waybacktweets?style=social)](https://github.com/claromes/waybacktweets)', - anchor=False) -st.write( - 'Display multiple archived tweets on Wayback Machine and avoid opening each link manually' -) - -handle = st.text_input('Username', placeholder='jack') - -st.session_state.saved_at = st.slider('Tweets saved between', 2006, year, - (2006, year)) - -not_available = st.checkbox( - 'Original URLs not available', - help= - 'Due to changes in X, it is possible to find available tweets if you are logged into X' -) - -query = st.button('Query', type='primary', use_container_width=True) - -if handle != 
st.session_state.current_handle: - st.session_state.current_handle = handle - st.session_state.offset = 0 - -if query or st.session_state.count: - tweets_per_page = 25 - - st.session_state.count = tweets_count(handle, st.session_state.saved_at) - - st.caption( - 'The search optimization uses an 8-digit [collapsing strategy](https://github.com/internetarchive/wayback/blob/master/wayback-cdx-server/README.md?ref=hackernoon.com#collapsing), refining the captures to one per day. The number of tweets per page is set to 25, and this is a fixed value due to the API rate limit.' - ) - st.write(f'**{st.session_state.count} URLs have been captured**') - - if st.session_state.count: - if tweets_per_page > st.session_state.count: - tweets_per_page = st.session_state.count - - try: - progress = st.empty() - links = query_api(handle, tweets_per_page, st.session_state.offset, - st.session_state.saved_at) - - parse = parse_links(links) - parsed_links = parse[0] - tweet_links = parse[1] - mimetype = parse[2] - timestamp = parse[3] - - if links: - st.divider() - - st.session_state.current_handle = handle - - return_none_count = 0 - - start_index = st.session_state.offset - end_index = min(st.session_state.count, - start_index + tweets_per_page) - - with st.spinner('Fetching...'): - for i in range(tweets_per_page): - try: - if tweet_links[i]: - link = parsed_links[i] - tweet = embed(tweet_links[i]) - - status = check_double_status(link, tweet_links[i]) - - if not not_available: - attr(i) - - if tweet: - status_code = tweet[0] - tweet_content = tweet[1] - user_info = tweet[2] - is_RT = tweet[3] - - display_tweet() - elif not tweet: - display_not_tweet() - - if not_available: - if not tweet: - return_none_count += 1 - attr(i) - - display_not_tweet() - - progress.write( - f'{return_none_count} URLs have been captured in the range {start_index}-{end_index}' - ) - - if start_index <= 0: - st.session_state.prev_disabled = True - else: - st.session_state.prev_disabled = False - - if i + 1 == st.session_state.count: - st.session_state.next_disabled = True - else: - st.session_state.next_disabled = False - except IndexError: - if start_index <= 0: - st.session_state.prev_disabled = True - else: - st.session_state.prev_disabled = False - - st.session_state.next_disabled = True - - prev, _, next = st.columns([3, 4, 3]) - - prev.button('Previous', - disabled=st.session_state.prev_disabled, - key='prev_button_key', - on_click=prev_page, - type='primary', - use_container_width=True) - next.button('Next', - disabled=st.session_state.next_disabled, - key='next_button_key', - on_click=next_page, - type='primary', - use_container_width=True) - - if not links: - st.error('Unable to query the Wayback Machine API.') - except TypeError as e: - st.error(f''' - {e}. Refresh this page and try again. - - If the problem persists [open an issue](https://github.com/claromes/waybacktweets/issues). 
-        ''')
-        st.session_state.offset = 0
diff --git a/app/.streamlit/config.toml b/app/.streamlit/config.toml
new file mode 100644
index 0000000..cefb509
--- /dev/null
+++ b/app/.streamlit/config.toml
@@ -0,0 +1,11 @@
+[theme]
+base = "light"
+primaryColor = "#ef5552"
+secondaryBackgroundColor = "#efefef"
+textColor = "#000000"
+backgroundColor = "#f9f9f9"
+font = "serif"
+
+[client]
+displayEnabled = true
+toolbarMode = "minimal"
diff --git a/app/assets/parthenon.svg b/app/assets/parthenon.svg
new file mode 100644
index 0000000..babc09e
--- /dev/null
+++ b/app/assets/parthenon.svg
@@ -0,0 +1,26 @@
+[26 lines of SVG markup for the parthenon icon; the XML tags did not survive this text capture]
diff --git a/app/new_app.py b/app/new_app.py
new file mode 100644
index 0000000..6f3eabf
--- /dev/null
+++ b/app/new_app.py
@@ -0,0 +1,427 @@
+import datetime
+
+import requests
+import streamlit as st
+import streamlit.components.v1 as components
+
+from waybacktweets.export_tweets import TweetsExporter
+from waybacktweets.parse_tweets import TweetsParser
+from waybacktweets.request_tweets import WaybackTweets
+from waybacktweets.utils import check_double_status
+
+# Initial Settings
+
+LOGO = "app/assets/parthenon.svg"
+
+st.set_page_config(
+    page_title="Wayback Tweets",
+    page_icon=LOGO,
+    layout="centered",
+    menu_items={
+        "Report a bug": "https://github.com/claromes/waybacktweets/issues",
+    },
+)
+
+# https://discuss.streamlit.io/t/remove-hide-running-man-animation-on-top-of-page/21773/3
+st.html(
+    """
+
+"""
+)
+
+if "current_username" not in st.session_state:
+    st.session_state.current_username = ""
+
+if "prev_disabled" not in st.session_state:
+    st.session_state.prev_disabled = False
+
+if "next_disabled" not in st.session_state:
+    st.session_state.next_disabled = False
+
+if "next_button" not in st.session_state:
+    st.session_state.next_button = False
+
+if "prev_button" not in st.session_state:
+    st.session_state.prev_button = False
+
+if "update_component" not in st.session_state:
+    st.session_state.update_component = 0
+
+if "offset" not in st.session_state:
+    st.session_state.offset = 0
+
+if "count" not in st.session_state:
+    st.session_state.count = False
+
+start_date = datetime.datetime(2006, 3, 1)
+end_date = datetime.datetime.now()
+
+if "archived_timestamp_filter" not in st.session_state:
+    st.session_state.archived_timestamp_filter = (start_date, end_date)
+
+
+# Pagination Settings
+
+
+def scroll_into_view():
+    script = f"""
+
+    """
+
+    components.html(script, width=0, height=0)
+
+
+def prev_page():
+    st.session_state.offset -= tweets_per_page
+
+    st.session_state.update_component += 1
+    scroll_into_view()
+
+
+def next_page():
+    st.session_state.offset += tweets_per_page
+
+    st.session_state.update_component += 1
+    scroll_into_view()
+
+
+# Requesting
+
+
+@st.cache_data(ttl=1800, show_spinner=False)
+def tweets_count(username, archived_timestamp_filter):
+    url = f"https://web.archive.org/cdx/search/cdx?url=https://twitter.com/{username}/status/*&collapse=timestamp:8&output=json&from={archived_timestamp_filter[0]}&to={archived_timestamp_filter[1]}"  # noqa: E501
+
+    try:
+        response = requests.get(url)
+
+        if response.status_code == 200:
+            data = response.json()
+            if data and len(data) > 1:
+                total_tweets = len(data) - 1
+                return total_tweets
+            else:
+                return 0
+    except requests.exceptions.Timeout:
+        st.error("Connection to web.archive.org timed out.")
+        st.stop()
+    except requests.exceptions.ConnectionError:
+        st.error("Failed to establish a new connection with web.archive.org.")
+        st.stop()
+    except UnboundLocalError:
+
st.empty() + + +# Interface Settings + + +st.logo(LOGO) + +st.success( + """**New Feature: CLI** + +You can now retrieve archived tweets using the Wayback Tweets command line tool. +Download the archived tweets' CDX data in CSV, JSON, and HTML formats. + +For more details, [read the documentation](https://github.com/claromes/waybacktweets).""" # noqa: E501 +) + +st.title( + "Wayback Tweets", + anchor=False, +) +st.caption( + "[![GitHub release (latest by date including pre-releases)](https://img.shields.io/github/v/release/claromes/waybacktweets?include_prereleases)](https://github.com/claromes/waybacktweets/releases) [![Star](https://img.shields.io/github/stars/claromes/waybacktweets?style=social)](https://github.com/claromes/waybacktweets)" # noqa: E501 +) +st.caption("Display multiple archived tweets on Wayback Machine") + +username = st.text_input("Username", placeholder="Without @") + +start_date = datetime.datetime(2006, 3, 1) +end_date = datetime.datetime.now() + +st.session_state.archived_timestamp_filter = st.date_input( + "Tweets saved between", + (start_date, end_date), + start_date, + end_date, + format="YYYY/MM/DD", + help="YYYY/MM/DD", +) + +not_available = st.checkbox("Only tweets not available") + +unique = st.checkbox( + "Only unique URLs", + help="Filtering by the collapse option using the urlkey field", +) + +query = st.button("Query", type="primary", use_container_width=True) + +# Tweet Listing Settings + + +if username != st.session_state.current_username: + st.session_state.current_username = username + st.session_state.offset = 0 + +if query or st.session_state.count: + tweets_per_page = 25 + + st.session_state.count = tweets_count( + username, st.session_state.archived_timestamp_filter + ) + + st.caption( + "The search optimization uses an 8-digit [collapsing strategy](https://github.com/internetarchive/wayback/blob/master/wayback-cdx-server/README.md?ref=hackernoon.com#collapsing), refining the captures to one per day. The number of tweets per page is set to 25, and this is a fixed value due to the API rate limit." 
# noqa: E501 + ) + st.write(f"**{st.session_state.count} URLs have been captured**") + + if st.session_state.count: + if tweets_per_page > st.session_state.count: + tweets_per_page = st.session_state.count + + try: + progress = st.empty() + + # Tweet Listing Processing + + response = WaybackTweets( + username, + unique, + st.session_state.archived_timestamp_filter[0], + st.session_state.archived_timestamp_filter[1], + tweets_per_page, + ) + archived_tweets = response.get() + + with st.spinner("Parsing..."): + if archived_tweets: + field_options = [ + "archived_urlkey", + "archived_timestamp", + "original_tweet_url", + "archived_tweet_url", + "parsed_tweet_url", + "parsed_archived_tweet_url", + "parsed_tweet_text_mimetype_json", + "available_tweet_text", + "available_tweet_is_RT", + "available_tweet_info", + "archived_mimetype", + "archived_statuscode", + ] + + parser = TweetsParser(archived_tweets, username, field_options) + parsed_tweets = parser.parse() + + exporter = TweetsExporter(parsed_tweets, username, field_options) + df = exporter.dataframe + + # file_path = "claromes_tweets_20240610210338.csv" + # df = pd.read_csv(file_path) + # df = df.fillna("") + + archived_urlkey = df["archived_urlkey"] + archived_timestamp = df["archived_timestamp"] + original_tweet_url = df["original_tweet_url"] + archived_tweet_url = df["archived_tweet_url"] + parsed_tweet_url = df["parsed_tweet_url"] + parsed_archived_tweet_url = df["parsed_archived_tweet_url"] + parsed_tweet_text_mimetype_json = df["parsed_tweet_text_mimetype_json"] + available_tweet_text = df["available_tweet_text"] + available_tweet_is_RT = df["available_tweet_is_RT"] + available_tweet_info = df["available_tweet_info"] + archived_mimetype = df["archived_mimetype"] + archived_statuscode = df["archived_statuscode"] + + st.divider() + + st.session_state.current_username = username + + return_none_count = 0 + + start_index = st.session_state.offset + end_index = min(st.session_state.count, start_index + tweets_per_page) + + for i in range(tweets_per_page): + try: + if original_tweet_url[i]: + + # Display all tweets + if not not_available: + st.markdown( + f'{i+1 + st.session_state.offset}. 
[**archived url ↗**]({archived_tweet_url[i]}) · [**tweet url ↗**]({original_tweet_url[i]}) · **MIME type:** {archived_mimetype[i]} · **archived timestamp:** {datetime.datetime.strptime(str(archived_timestamp[i]), "%Y%m%d%H%M%S")} · **archived status code:** {archived_statuscode[i]}' # noqa: E501 + ) + + # Display available tweets + if available_tweet_text[i]: + if available_tweet_is_RT[i]: + st.info("*Retweet*") + + st.write(available_tweet_text[i]) + st.write(f"**{available_tweet_info[i]}**") + + st.divider() + + # Display tweets not available with text/html, unk, warc/revisit return # noqa: E501 + elif ( + archived_mimetype[i] != "application/json" + and not available_tweet_text[i] + ): + if ( + ".jpg" in original_tweet_url[i] + or ".png" in original_tweet_url[i] + ) and (400 <= archived_statuscode[i] <= 511): + components.iframe( + archived_tweet_url[i], + height=500, + scrolling=True, + ) + elif "/status/" not in original_tweet_url[i]: + st.info( + "This isn't a status or is not available" + ) + elif ( + check_double_status( + archived_tweet_url[i], original_tweet_url[i] + ) + or f"{st.session_state.current_username}" + not in original_tweet_url[i] + ): + st.info( + f"Replying to {st.session_state.current_username}" # noqa: E501 + ) + else: + components.iframe( + archived_tweet_url[i], + height=500, + scrolling=True, + ) + + st.divider() + + # Display tweets not available with application/json return # noqa: E501 + elif ( + archived_mimetype[i] == "application/json" + and not available_tweet_text[i] + ): + st.code(parsed_tweet_text_mimetype_json[i]) + # st.json(json_data, expanded=False) + + st.divider() + + # Display only tweets not available + if not_available: + return_none_count += 1 + + st.markdown( + f'{i+1 + st.session_state.offset}. [**archived url ↗**]({archived_tweet_url[i]}) · [**tweet url ↗**]({original_tweet_url[i]}) · **MIME type:** {archived_mimetype[i]} · **archived timestamp:** {datetime.datetime.strptime(str(archived_timestamp[i]), "%Y%m%d%H%M%S")} · **archived status code:** {archived_statuscode[i]}' # noqa: E501 + ) + + # Display tweets not available with text/html, unk, warc/revisit return # noqa: E501 + if ( + archived_mimetype[i] != "application/json" + and not available_tweet_text[i] + ): + if ( + ".jpg" in original_tweet_url[i] + or ".png" in original_tweet_url[i] + ) and (400 <= archived_statuscode[i] <= 511): + components.iframe( + archived_tweet_url[i], + height=500, + scrolling=True, + ) + elif "/status/" not in original_tweet_url[i]: + st.info( + "This isn't a status or is not available" + ) + elif ( + check_double_status( + archived_tweet_url[i], original_tweet_url[i] + ) + or f"{st.session_state.current_username}" + not in original_tweet_url[i] + ): + st.info( + f"Replying to {st.session_state.current_username}" # noqa: E501 + ) + else: + components.iframe( + archived_tweet_url[i], + height=500, + scrolling=True, + ) + + st.divider() + + # Display tweets not available with application/json return # noqa: E501 + elif ( + archived_mimetype[i] == "application/json" + and not available_tweet_text[i] + ): + st.code(parsed_tweet_text_mimetype_json[i]) + # st.json(json_data, expanded=False) + + st.divider() + + progress.write( + f"{return_none_count} URLs have been captured in the range {start_index}-{end_index}" # noqa: E501 + ) + pass + except IndexError: + if start_index <= 0: + st.session_state.prev_disabled = True + else: + st.session_state.prev_disabled = False + + st.session_state.next_disabled = True + + prev, _, next = st.columns([3, 4, 3]) + + 
prev.button( + "Previous", + disabled=st.session_state.prev_disabled, + key="prev_button_key", + on_click=prev_page, + type="primary", + use_container_width=True, + ) + next.button( + "Next", + disabled=st.session_state.next_disabled, + key="next_button_key", + on_click=next_page, + type="primary", + use_container_width=True, + ) + + if not archived_tweets: + st.error("Unable to query the Wayback Machine API.") + except TypeError as e: + st.error( + f""" + {e}. Refresh this page and try again. + + If the problem persists [open an issue](https://github.com/claromes/waybacktweets/issues).""" # noqa: E501 + ) + st.session_state.offset = 0 diff --git a/docs/index.md b/docs/index.md index 0c0afb0..1523c52 100644 --- a/docs/index.md +++ b/docs/index.md @@ -3,3 +3,18 @@ # Wayback Tweets Retrieves archived tweets' CDX data from the Wayback Machine, performs necessary parsing, and saves the data. + +## Workflow + +```mermaid +flowchart TB + A[input Username] --> B[(Wayback Machine)] + B --> C{embed Tweet URL\nvia Twitter Publisher} + C --> |2xx/3xx| D[return Tweet text] + C --> |4xx| E[return None] + E --> F{request Archived\nTweet URL} + F --> |2xx/3xx| I{Parsing} + F --> |4xx| G[return Only CDX data] + I --> |application/json| J[return JSON text] + I --> |text/html| K[return HTML iframe tag] +``` diff --git a/mkdocs.yml b/mkdocs.yml index d744b2c..93f1a26 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -46,6 +46,11 @@ markdown_extensions: options: custom_icons: - overrides/.icons + - pymdownx.superfences: + custom_fences: + - name: mermaid + class: mermaid + format: !!python/name:pymdownx.superfences.fence_code_format extra_css: - stylesheets/extra.css diff --git a/poetry.lock b/poetry.lock index ae31e45..03e0652 100644 --- a/poetry.lock +++ b/poetry.lock @@ -500,22 +500,22 @@ files = [ [[package]] name = "importlib-metadata" -version = "6.11.0" +version = "7.1.0" description = "Read metadata from Python packages" optional = false python-versions = ">=3.8" files = [ - {file = "importlib_metadata-6.11.0-py3-none-any.whl", hash = "sha256:f0afba6205ad8f8947c7d338b5342d5db2afbfd82f9cbef7879a9539cc12eb9b"}, - {file = "importlib_metadata-6.11.0.tar.gz", hash = "sha256:1231cf92d825c9e03cfc4da076a16de6422c863558229ea0b22b675657463443"}, + {file = "importlib_metadata-7.1.0-py3-none-any.whl", hash = "sha256:30962b96c0c223483ed6cc7280e7f0199feb01a0e40cfae4d4450fc6fab1f570"}, + {file = "importlib_metadata-7.1.0.tar.gz", hash = "sha256:b78938b926ee8d5f020fc4772d487045805a55ddbad2ecf21c6d60938dc7fcd2"}, ] [package.dependencies] zipp = ">=0.5" [package.extras] -docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (<7.2.5)", "sphinx (>=3.5)", "sphinx-lint"] +docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] perf = ["ipython"] -testing = ["flufl.flake8", "importlib-resources (>=1.3)", "packaging", "pyfakefs", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy (>=0.9.1)", "pytest-perf (>=0.9.2)", "pytest-ruff"] +testing = ["flufl.flake8", "importlib-resources (>=1.3)", "jaraco.test (>=5.4)", "packaging", "pyfakefs", "pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy", "pytest-perf (>=0.9.2)", "pytest-ruff (>=0.2.1)"] [[package]] name = "isort" @@ -884,13 +884,13 @@ files = [ [[package]] name = "packaging" -version = "23.2" +version = "24.1" description = "Core utilities for Python 
packages" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "packaging-23.2-py3-none-any.whl", hash = "sha256:8c491190033a9af7e1d931d0b5dacc2ef47509b34dd0de67ed209b5203fc88c7"}, - {file = "packaging-23.2.tar.gz", hash = "sha256:048fb0e9405036518eaaf48a55953c750c11e1a1b68e0dd1a9d62ed0c092cfc5"}, + {file = "packaging-24.1-py3-none-any.whl", hash = "sha256:5b8f2217dbdbd2f7f384c41c628544e6d52f2d0f53c6d0c3ea61aa5d1d7ff124"}, + {file = "packaging-24.1.tar.gz", hash = "sha256:026ed72c8ed3fcce5bf8950572258698927fd1dbda10a5e981cdf0ac37f4f002"}, ] [[package]] @@ -989,82 +989,89 @@ files = [ [[package]] name = "pillow" -version = "9.5.0" +version = "10.3.0" description = "Python Imaging Library (Fork)" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "Pillow-9.5.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:ace6ca218308447b9077c14ea4ef381ba0b67ee78d64046b3f19cf4e1139ad16"}, - {file = "Pillow-9.5.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d3d403753c9d5adc04d4694d35cf0391f0f3d57c8e0030aac09d7678fa8030aa"}, - {file = "Pillow-9.5.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5ba1b81ee69573fe7124881762bb4cd2e4b6ed9dd28c9c60a632902fe8db8b38"}, - {file = "Pillow-9.5.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fe7e1c262d3392afcf5071df9afa574544f28eac825284596ac6db56e6d11062"}, - {file = "Pillow-9.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f36397bf3f7d7c6a3abdea815ecf6fd14e7fcd4418ab24bae01008d8d8ca15e"}, - {file = "Pillow-9.5.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:252a03f1bdddce077eff2354c3861bf437c892fb1832f75ce813ee94347aa9b5"}, - {file = "Pillow-9.5.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:85ec677246533e27770b0de5cf0f9d6e4ec0c212a1f89dfc941b64b21226009d"}, - {file = "Pillow-9.5.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:b416f03d37d27290cb93597335a2f85ed446731200705b22bb927405320de903"}, - {file = "Pillow-9.5.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:1781a624c229cb35a2ac31cc4a77e28cafc8900733a864870c49bfeedacd106a"}, - {file = "Pillow-9.5.0-cp310-cp310-win32.whl", hash = "sha256:8507eda3cd0608a1f94f58c64817e83ec12fa93a9436938b191b80d9e4c0fc44"}, - {file = "Pillow-9.5.0-cp310-cp310-win_amd64.whl", hash = "sha256:d3c6b54e304c60c4181da1c9dadf83e4a54fd266a99c70ba646a9baa626819eb"}, - {file = "Pillow-9.5.0-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:7ec6f6ce99dab90b52da21cf0dc519e21095e332ff3b399a357c187b1a5eee32"}, - {file = "Pillow-9.5.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:560737e70cb9c6255d6dcba3de6578a9e2ec4b573659943a5e7e4af13f298f5c"}, - {file = "Pillow-9.5.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:96e88745a55b88a7c64fa49bceff363a1a27d9a64e04019c2281049444a571e3"}, - {file = "Pillow-9.5.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d9c206c29b46cfd343ea7cdfe1232443072bbb270d6a46f59c259460db76779a"}, - {file = "Pillow-9.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cfcc2c53c06f2ccb8976fb5c71d448bdd0a07d26d8e07e321c103416444c7ad1"}, - {file = "Pillow-9.5.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:a0f9bb6c80e6efcde93ffc51256d5cfb2155ff8f78292f074f60f9e70b942d99"}, - {file = "Pillow-9.5.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:8d935f924bbab8f0a9a28404422da8af4904e36d5c33fc6f677e4c4485515625"}, - 
{file = "Pillow-9.5.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:fed1e1cf6a42577953abbe8e6cf2fe2f566daebde7c34724ec8803c4c0cda579"}, - {file = "Pillow-9.5.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:c1170d6b195555644f0616fd6ed929dfcf6333b8675fcca044ae5ab110ded296"}, - {file = "Pillow-9.5.0-cp311-cp311-win32.whl", hash = "sha256:54f7102ad31a3de5666827526e248c3530b3a33539dbda27c6843d19d72644ec"}, - {file = "Pillow-9.5.0-cp311-cp311-win_amd64.whl", hash = "sha256:cfa4561277f677ecf651e2b22dc43e8f5368b74a25a8f7d1d4a3a243e573f2d4"}, - {file = "Pillow-9.5.0-cp311-cp311-win_arm64.whl", hash = "sha256:965e4a05ef364e7b973dd17fc765f42233415974d773e82144c9bbaaaea5d089"}, - {file = "Pillow-9.5.0-cp312-cp312-win32.whl", hash = "sha256:22baf0c3cf0c7f26e82d6e1adf118027afb325e703922c8dfc1d5d0156bb2eeb"}, - {file = "Pillow-9.5.0-cp312-cp312-win_amd64.whl", hash = "sha256:432b975c009cf649420615388561c0ce7cc31ce9b2e374db659ee4f7d57a1f8b"}, - {file = "Pillow-9.5.0-cp37-cp37m-macosx_10_10_x86_64.whl", hash = "sha256:5d4ebf8e1db4441a55c509c4baa7a0587a0210f7cd25fcfe74dbbce7a4bd1906"}, - {file = "Pillow-9.5.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:375f6e5ee9620a271acb6820b3d1e94ffa8e741c0601db4c0c4d3cb0a9c224bf"}, - {file = "Pillow-9.5.0-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:99eb6cafb6ba90e436684e08dad8be1637efb71c4f2180ee6b8f940739406e78"}, - {file = "Pillow-9.5.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2dfaaf10b6172697b9bceb9a3bd7b951819d1ca339a5ef294d1f1ac6d7f63270"}, - {file = "Pillow-9.5.0-cp37-cp37m-manylinux_2_28_aarch64.whl", hash = "sha256:763782b2e03e45e2c77d7779875f4432e25121ef002a41829d8868700d119392"}, - {file = "Pillow-9.5.0-cp37-cp37m-manylinux_2_28_x86_64.whl", hash = "sha256:35f6e77122a0c0762268216315bf239cf52b88865bba522999dc38f1c52b9b47"}, - {file = "Pillow-9.5.0-cp37-cp37m-win32.whl", hash = "sha256:aca1c196f407ec7cf04dcbb15d19a43c507a81f7ffc45b690899d6a76ac9fda7"}, - {file = "Pillow-9.5.0-cp37-cp37m-win_amd64.whl", hash = "sha256:322724c0032af6692456cd6ed554bb85f8149214d97398bb80613b04e33769f6"}, - {file = "Pillow-9.5.0-cp38-cp38-macosx_10_10_x86_64.whl", hash = "sha256:a0aa9417994d91301056f3d0038af1199eb7adc86e646a36b9e050b06f526597"}, - {file = "Pillow-9.5.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:f8286396b351785801a976b1e85ea88e937712ee2c3ac653710a4a57a8da5d9c"}, - {file = "Pillow-9.5.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c830a02caeb789633863b466b9de10c015bded434deb3ec87c768e53752ad22a"}, - {file = "Pillow-9.5.0-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fbd359831c1657d69bb81f0db962905ee05e5e9451913b18b831febfe0519082"}, - {file = "Pillow-9.5.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f8fc330c3370a81bbf3f88557097d1ea26cd8b019d6433aa59f71195f5ddebbf"}, - {file = "Pillow-9.5.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:7002d0797a3e4193c7cdee3198d7c14f92c0836d6b4a3f3046a64bd1ce8df2bf"}, - {file = "Pillow-9.5.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:229e2c79c00e85989a34b5981a2b67aa079fd08c903f0aaead522a1d68d79e51"}, - {file = "Pillow-9.5.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:9adf58f5d64e474bed00d69bcd86ec4bcaa4123bfa70a65ce72e424bfb88ed96"}, - {file = "Pillow-9.5.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:662da1f3f89a302cc22faa9f14a262c2e3951f9dbc9617609a47521c69dd9f8f"}, - {file = 
"Pillow-9.5.0-cp38-cp38-win32.whl", hash = "sha256:6608ff3bf781eee0cd14d0901a2b9cc3d3834516532e3bd673a0a204dc8615fc"}, - {file = "Pillow-9.5.0-cp38-cp38-win_amd64.whl", hash = "sha256:e49eb4e95ff6fd7c0c402508894b1ef0e01b99a44320ba7d8ecbabefddcc5569"}, - {file = "Pillow-9.5.0-cp39-cp39-macosx_10_10_x86_64.whl", hash = "sha256:482877592e927fd263028c105b36272398e3e1be3269efda09f6ba21fd83ec66"}, - {file = "Pillow-9.5.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3ded42b9ad70e5f1754fb7c2e2d6465a9c842e41d178f262e08b8c85ed8a1d8e"}, - {file = "Pillow-9.5.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c446d2245ba29820d405315083d55299a796695d747efceb5717a8b450324115"}, - {file = "Pillow-9.5.0-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8aca1152d93dcc27dc55395604dcfc55bed5f25ef4c98716a928bacba90d33a3"}, - {file = "Pillow-9.5.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:608488bdcbdb4ba7837461442b90ea6f3079397ddc968c31265c1e056964f1ef"}, - {file = "Pillow-9.5.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:60037a8db8750e474af7ffc9faa9b5859e6c6d0a50e55c45576bf28be7419705"}, - {file = "Pillow-9.5.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:07999f5834bdc404c442146942a2ecadd1cb6292f5229f4ed3b31e0a108746b1"}, - {file = "Pillow-9.5.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:a127ae76092974abfbfa38ca2d12cbeddcdeac0fb71f9627cc1135bedaf9d51a"}, - {file = "Pillow-9.5.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:489f8389261e5ed43ac8ff7b453162af39c3e8abd730af8363587ba64bb2e865"}, - {file = "Pillow-9.5.0-cp39-cp39-win32.whl", hash = "sha256:9b1af95c3a967bf1da94f253e56b6286b50af23392a886720f563c547e48e964"}, - {file = "Pillow-9.5.0-cp39-cp39-win_amd64.whl", hash = "sha256:77165c4a5e7d5a284f10a6efaa39a0ae8ba839da344f20b111d62cc932fa4e5d"}, - {file = "Pillow-9.5.0-pp38-pypy38_pp73-macosx_10_10_x86_64.whl", hash = "sha256:833b86a98e0ede388fa29363159c9b1a294b0905b5128baf01db683672f230f5"}, - {file = "Pillow-9.5.0-pp38-pypy38_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:aaf305d6d40bd9632198c766fb64f0c1a83ca5b667f16c1e79e1661ab5060140"}, - {file = "Pillow-9.5.0-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0852ddb76d85f127c135b6dd1f0bb88dbb9ee990d2cd9aa9e28526c93e794fba"}, - {file = "Pillow-9.5.0-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:91ec6fe47b5eb5a9968c79ad9ed78c342b1f97a091677ba0e012701add857829"}, - {file = "Pillow-9.5.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:cb841572862f629b99725ebaec3287fc6d275be9b14443ea746c1dd325053cbd"}, - {file = "Pillow-9.5.0-pp39-pypy39_pp73-macosx_10_10_x86_64.whl", hash = "sha256:c380b27d041209b849ed246b111b7c166ba36d7933ec6e41175fd15ab9eb1572"}, - {file = "Pillow-9.5.0-pp39-pypy39_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7c9af5a3b406a50e313467e3565fc99929717f780164fe6fbb7704edba0cebbe"}, - {file = "Pillow-9.5.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5671583eab84af046a397d6d0ba25343c00cd50bce03787948e0fff01d4fd9b1"}, - {file = "Pillow-9.5.0-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:84a6f19ce086c1bf894644b43cd129702f781ba5751ca8572f08aa40ef0ab7b7"}, - {file = "Pillow-9.5.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:1e7723bd90ef94eda669a3c2c19d549874dd5badaeefabefd26053304abe5799"}, - {file = "Pillow-9.5.0.tar.gz", hash = 
"sha256:bf548479d336726d7a0eceb6e767e179fbde37833ae42794602631a070d630f1"}, + {file = "pillow-10.3.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:90b9e29824800e90c84e4022dd5cc16eb2d9605ee13f05d47641eb183cd73d45"}, + {file = "pillow-10.3.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:a2c405445c79c3f5a124573a051062300936b0281fee57637e706453e452746c"}, + {file = "pillow-10.3.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:78618cdbccaa74d3f88d0ad6cb8ac3007f1a6fa5c6f19af64b55ca170bfa1edf"}, + {file = "pillow-10.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:261ddb7ca91fcf71757979534fb4c128448b5b4c55cb6152d280312062f69599"}, + {file = "pillow-10.3.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:ce49c67f4ea0609933d01c0731b34b8695a7a748d6c8d186f95e7d085d2fe475"}, + {file = "pillow-10.3.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:b14f16f94cbc61215115b9b1236f9c18403c15dd3c52cf629072afa9d54c1cbf"}, + {file = "pillow-10.3.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:d33891be6df59d93df4d846640f0e46f1a807339f09e79a8040bc887bdcd7ed3"}, + {file = "pillow-10.3.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:b50811d664d392f02f7761621303eba9d1b056fb1868c8cdf4231279645c25f5"}, + {file = "pillow-10.3.0-cp310-cp310-win32.whl", hash = "sha256:ca2870d5d10d8726a27396d3ca4cf7976cec0f3cb706debe88e3a5bd4610f7d2"}, + {file = "pillow-10.3.0-cp310-cp310-win_amd64.whl", hash = "sha256:f0d0591a0aeaefdaf9a5e545e7485f89910c977087e7de2b6c388aec32011e9f"}, + {file = "pillow-10.3.0-cp310-cp310-win_arm64.whl", hash = "sha256:ccce24b7ad89adb5a1e34a6ba96ac2530046763912806ad4c247356a8f33a67b"}, + {file = "pillow-10.3.0-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:5f77cf66e96ae734717d341c145c5949c63180842a545c47a0ce7ae52ca83795"}, + {file = "pillow-10.3.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e4b878386c4bf293578b48fc570b84ecfe477d3b77ba39a6e87150af77f40c57"}, + {file = "pillow-10.3.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fdcbb4068117dfd9ce0138d068ac512843c52295ed996ae6dd1faf537b6dbc27"}, + {file = "pillow-10.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9797a6c8fe16f25749b371c02e2ade0efb51155e767a971c61734b1bf6293994"}, + {file = "pillow-10.3.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:9e91179a242bbc99be65e139e30690e081fe6cb91a8e77faf4c409653de39451"}, + {file = "pillow-10.3.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:1b87bd9d81d179bd8ab871603bd80d8645729939f90b71e62914e816a76fc6bd"}, + {file = "pillow-10.3.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:81d09caa7b27ef4e61cb7d8fbf1714f5aec1c6b6c5270ee53504981e6e9121ad"}, + {file = "pillow-10.3.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:048ad577748b9fa4a99a0548c64f2cb8d672d5bf2e643a739ac8faff1164238c"}, + {file = "pillow-10.3.0-cp311-cp311-win32.whl", hash = "sha256:7161ec49ef0800947dc5570f86568a7bb36fa97dd09e9827dc02b718c5643f09"}, + {file = "pillow-10.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:8eb0908e954d093b02a543dc963984d6e99ad2b5e36503d8a0aaf040505f747d"}, + {file = "pillow-10.3.0-cp311-cp311-win_arm64.whl", hash = "sha256:4e6f7d1c414191c1199f8996d3f2282b9ebea0945693fb67392c75a3a320941f"}, + {file = "pillow-10.3.0-cp312-cp312-macosx_10_10_x86_64.whl", hash = "sha256:e46f38133e5a060d46bd630faa4d9fa0202377495df1f068a8299fd78c84de84"}, + {file = "pillow-10.3.0-cp312-cp312-macosx_11_0_arm64.whl", hash = 
"sha256:50b8eae8f7334ec826d6eeffaeeb00e36b5e24aa0b9df322c247539714c6df19"}, + {file = "pillow-10.3.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9d3bea1c75f8c53ee4d505c3e67d8c158ad4df0d83170605b50b64025917f338"}, + {file = "pillow-10.3.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:19aeb96d43902f0a783946a0a87dbdad5c84c936025b8419da0a0cd7724356b1"}, + {file = "pillow-10.3.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:74d28c17412d9caa1066f7a31df8403ec23d5268ba46cd0ad2c50fb82ae40462"}, + {file = "pillow-10.3.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:ff61bfd9253c3915e6d41c651d5f962da23eda633cf02262990094a18a55371a"}, + {file = "pillow-10.3.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:d886f5d353333b4771d21267c7ecc75b710f1a73d72d03ca06df49b09015a9ef"}, + {file = "pillow-10.3.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4b5ec25d8b17217d635f8935dbc1b9aa5907962fae29dff220f2659487891cd3"}, + {file = "pillow-10.3.0-cp312-cp312-win32.whl", hash = "sha256:51243f1ed5161b9945011a7360e997729776f6e5d7005ba0c6879267d4c5139d"}, + {file = "pillow-10.3.0-cp312-cp312-win_amd64.whl", hash = "sha256:412444afb8c4c7a6cc11a47dade32982439925537e483be7c0ae0cf96c4f6a0b"}, + {file = "pillow-10.3.0-cp312-cp312-win_arm64.whl", hash = "sha256:798232c92e7665fe82ac085f9d8e8ca98826f8e27859d9a96b41d519ecd2e49a"}, + {file = "pillow-10.3.0-cp38-cp38-macosx_10_10_x86_64.whl", hash = "sha256:4eaa22f0d22b1a7e93ff0a596d57fdede2e550aecffb5a1ef1106aaece48e96b"}, + {file = "pillow-10.3.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:cd5e14fbf22a87321b24c88669aad3a51ec052eb145315b3da3b7e3cc105b9a2"}, + {file = "pillow-10.3.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1530e8f3a4b965eb6a7785cf17a426c779333eb62c9a7d1bbcf3ffd5bf77a4aa"}, + {file = "pillow-10.3.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5d512aafa1d32efa014fa041d38868fda85028e3f930a96f85d49c7d8ddc0383"}, + {file = "pillow-10.3.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:339894035d0ede518b16073bdc2feef4c991ee991a29774b33e515f1d308e08d"}, + {file = "pillow-10.3.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:aa7e402ce11f0885305bfb6afb3434b3cd8f53b563ac065452d9d5654c7b86fd"}, + {file = "pillow-10.3.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:0ea2a783a2bdf2a561808fe4a7a12e9aa3799b701ba305de596bc48b8bdfce9d"}, + {file = "pillow-10.3.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:c78e1b00a87ce43bb37642c0812315b411e856a905d58d597750eb79802aaaa3"}, + {file = "pillow-10.3.0-cp38-cp38-win32.whl", hash = "sha256:72d622d262e463dfb7595202d229f5f3ab4b852289a1cd09650362db23b9eb0b"}, + {file = "pillow-10.3.0-cp38-cp38-win_amd64.whl", hash = "sha256:2034f6759a722da3a3dbd91a81148cf884e91d1b747992ca288ab88c1de15999"}, + {file = "pillow-10.3.0-cp39-cp39-macosx_10_10_x86_64.whl", hash = "sha256:2ed854e716a89b1afcedea551cd85f2eb2a807613752ab997b9974aaa0d56936"}, + {file = "pillow-10.3.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:dc1a390a82755a8c26c9964d457d4c9cbec5405896cba94cf51f36ea0d855002"}, + {file = "pillow-10.3.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4203efca580f0dd6f882ca211f923168548f7ba334c189e9eab1178ab840bf60"}, + {file = "pillow-10.3.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3102045a10945173d38336f6e71a8dc71bcaeed55c3123ad4af82c52807b9375"}, + {file = 
"pillow-10.3.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:6fb1b30043271ec92dc65f6d9f0b7a830c210b8a96423074b15c7bc999975f57"}, + {file = "pillow-10.3.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:1dfc94946bc60ea375cc39cff0b8da6c7e5f8fcdc1d946beb8da5c216156ddd8"}, + {file = "pillow-10.3.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:b09b86b27a064c9624d0a6c54da01c1beaf5b6cadfa609cf63789b1d08a797b9"}, + {file = "pillow-10.3.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:d3b2348a78bc939b4fed6552abfd2e7988e0f81443ef3911a4b8498ca084f6eb"}, + {file = "pillow-10.3.0-cp39-cp39-win32.whl", hash = "sha256:45ebc7b45406febf07fef35d856f0293a92e7417ae7933207e90bf9090b70572"}, + {file = "pillow-10.3.0-cp39-cp39-win_amd64.whl", hash = "sha256:0ba26351b137ca4e0db0342d5d00d2e355eb29372c05afd544ebf47c0956ffeb"}, + {file = "pillow-10.3.0-cp39-cp39-win_arm64.whl", hash = "sha256:50fd3f6b26e3441ae07b7c979309638b72abc1a25da31a81a7fbd9495713ef4f"}, + {file = "pillow-10.3.0-pp310-pypy310_pp73-macosx_10_10_x86_64.whl", hash = "sha256:6b02471b72526ab8a18c39cb7967b72d194ec53c1fd0a70b050565a0f366d355"}, + {file = "pillow-10.3.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:8ab74c06ffdab957d7670c2a5a6e1a70181cd10b727cd788c4dd9005b6a8acd9"}, + {file = "pillow-10.3.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:048eeade4c33fdf7e08da40ef402e748df113fd0b4584e32c4af74fe78baaeb2"}, + {file = "pillow-10.3.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9e2ec1e921fd07c7cda7962bad283acc2f2a9ccc1b971ee4b216b75fad6f0463"}, + {file = "pillow-10.3.0-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:4c8e73e99da7db1b4cad7f8d682cf6abad7844da39834c288fbfa394a47bbced"}, + {file = "pillow-10.3.0-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:16563993329b79513f59142a6b02055e10514c1a8e86dca8b48a893e33cf91e3"}, + {file = "pillow-10.3.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:dd78700f5788ae180b5ee8902c6aea5a5726bac7c364b202b4b3e3ba2d293170"}, + {file = "pillow-10.3.0-pp39-pypy39_pp73-macosx_10_10_x86_64.whl", hash = "sha256:aff76a55a8aa8364d25400a210a65ff59d0168e0b4285ba6bf2bd83cf675ba32"}, + {file = "pillow-10.3.0-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:b7bc2176354defba3edc2b9a777744462da2f8e921fbaf61e52acb95bafa9828"}, + {file = "pillow-10.3.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:793b4e24db2e8742ca6423d3fde8396db336698c55cd34b660663ee9e45ed37f"}, + {file = "pillow-10.3.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d93480005693d247f8346bc8ee28c72a2191bdf1f6b5db469c096c0c867ac015"}, + {file = "pillow-10.3.0-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:c83341b89884e2b2e55886e8fbbf37c3fa5efd6c8907124aeb72f285ae5696e5"}, + {file = "pillow-10.3.0-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:1a1d1915db1a4fdb2754b9de292642a39a7fb28f1736699527bb649484fb966a"}, + {file = "pillow-10.3.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:a0eaa93d054751ee9964afa21c06247779b90440ca41d184aeb5d410f20ff591"}, + {file = "pillow-10.3.0.tar.gz", hash = "sha256:9d2455fbf44c914840c793e89aa82d0e1763a14253a000743719ae5946814b2d"}, ] [package.extras] docs = ["furo", "olefile", "sphinx (>=2.4)", "sphinx-copybutton", "sphinx-inline-tabs", "sphinx-removed-in", "sphinxext-opengraph"] +fpx = ["olefile"] +mic = ["olefile"] tests = ["check-manifest", "coverage", "defusedxml", "markdown2", 
"olefile", "packaging", "pyroma", "pytest", "pytest-cov", "pytest-timeout"] +typing = ["typing-extensions"] +xmp = ["defusedxml"] [[package]] name = "platformdirs" @@ -1625,13 +1632,13 @@ files = [ [[package]] name = "streamlit" -version = "1.27.0" +version = "1.35.0" description = "A faster way to build and share data apps" optional = false -python-versions = ">=3.8, !=3.9.7" +python-versions = "!=3.9.7,>=3.8" files = [ - {file = "streamlit-1.27.0-py2.py3-none-any.whl", hash = "sha256:7488d4e22689d04f40449a1de521ba252ae95a5fbb5f5c606df16a4e16048b47"}, - {file = "streamlit-1.27.0.tar.gz", hash = "sha256:59a704195bbc669c794ddfcc3818480d9b9c2a282c02b48aa6e6de4b1c38d0a0"}, + {file = "streamlit-1.35.0-py2.py3-none-any.whl", hash = "sha256:e17d1d86830a0d7687c37faf2fe47bffa752d0c95a306e96d7749bd3faa72a5b"}, + {file = "streamlit-1.35.0.tar.gz", hash = "sha256:679d55bb6189743f606abf0696623df0bfd223a6d0c8d96b8d60678d4891d2d6"}, ] [package.dependencies] @@ -1640,27 +1647,23 @@ blinker = ">=1.0.0,<2" cachetools = ">=4.0,<6" click = ">=7.0,<9" gitpython = ">=3.0.7,<3.1.19 || >3.1.19,<4" -importlib-metadata = ">=1.4,<7" numpy = ">=1.19.3,<2" -packaging = ">=16.8,<24" +packaging = ">=16.8,<25" pandas = ">=1.3.0,<3" -pillow = ">=7.1.0,<10" +pillow = ">=7.1.0,<11" protobuf = ">=3.20,<5" -pyarrow = ">=6.0" +pyarrow = ">=7.0" pydeck = ">=0.8.0b4,<1" -python-dateutil = ">=2.7.3,<3" -requests = ">=2.18,<3" +requests = ">=2.27,<3" rich = ">=10.14.0,<14" tenacity = ">=8.1.0,<9" toml = ">=0.10.1,<2" tornado = ">=6.0.3,<7" -typing-extensions = ">=4.1.0,<5" -tzlocal = ">=1.1,<6" -validators = ">=0.2,<1" +typing-extensions = ">=4.3.0,<5" watchdog = {version = ">=2.1.5", markers = "platform_system != \"Darwin\""} [package.extras] -snowflake = ["snowflake-snowpark-python"] +snowflake = ["snowflake-connector-python (>=2.8.0)", "snowflake-snowpark-python (>=0.9.0)"] [[package]] name = "tenacity" @@ -1752,23 +1755,6 @@ files = [ {file = "tzdata-2024.1.tar.gz", hash = "sha256:2674120f8d891909751c38abcdfd386ac0a5a1127954fbc332af6b5ceae07efd"}, ] -[[package]] -name = "tzlocal" -version = "5.2" -description = "tzinfo object for the local timezone" -optional = false -python-versions = ">=3.8" -files = [ - {file = "tzlocal-5.2-py3-none-any.whl", hash = "sha256:49816ef2fe65ea8ac19d19aa7a1ae0551c834303d5014c6d5a62e4cbda8047b8"}, - {file = "tzlocal-5.2.tar.gz", hash = "sha256:8d399205578f1a9342816409cc1e46a93ebd5755e39ea2d85334bea911bf0e6e"}, -] - -[package.dependencies] -tzdata = {version = "*", markers = "platform_system == \"Windows\""} - -[package.extras] -devenv = ["check-manifest", "pytest (>=4.3)", "pytest-cov", "pytest-mock (>=3.3)", "zest.releaser"] - [[package]] name = "urllib3" version = "2.2.1" @@ -1786,17 +1772,6 @@ h2 = ["h2 (>=4,<5)"] socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"] zstd = ["zstandard (>=0.18.0)"] -[[package]] -name = "validators" -version = "0.28.3" -description = "Python Data Validation for Humans™" -optional = false -python-versions = ">=3.8" -files = [ - {file = "validators-0.28.3-py3-none-any.whl", hash = "sha256:53cafa854f13850156259d9cc479b864ee901f6a96e6b109e6fc33f98f37d99f"}, - {file = "validators-0.28.3.tar.gz", hash = "sha256:c6c79840bcde9ba77b19f6218f7738188115e27830cbaff43264bc4ed24c429d"}, -] - [[package]] name = "virtualenv" version = "20.26.2" @@ -1879,4 +1854,4 @@ test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", [metadata] lock-version = "2.0" python-versions = ">=3.9,<3.9.7 || >3.9.7,<4.0" -content-hash = 
"42b006d1fdee1ed5cf06d63c01ef2f8b4fa94839a0ebf52a9e16d3e85c4ed202" +content-hash = "a19d90802ba9ba601c8056a9920812ccfe59e49a0ae50d92cdf9e89f56c7718f" diff --git a/pyproject.toml b/pyproject.toml index 05bf063..2b05ca8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,7 +9,7 @@ readme = "README.md" [tool.poetry.dependencies] python = ">=3.9,<3.9.7 || >3.9.7,<4.0" requests = "^2.30.0" -streamlit = "1.27.0" +streamlit = "1.35.0" rich = "^13.6.0" httpx = "^0.27.0" click = "^8.1.7" diff --git a/waybacktweets/parse_tweets.py b/waybacktweets/parse_tweets.py index 6e4c2a3..76ad899 100644 --- a/waybacktweets/parse_tweets.py +++ b/waybacktweets/parse_tweets.py @@ -1,8 +1,9 @@ import re +import time from concurrent.futures import ThreadPoolExecutor, as_completed from urllib.parse import unquote -import httpx +import requests from rich import print as rprint from rich.progress import Progress @@ -25,8 +26,9 @@ class TwitterEmbed: """Parses the archived tweets when they are still available.""" try: url = f"https://publish.twitter.com/oembed?url={self.tweet_url}" - response = httpx.get(url) - if not (400 <= response.status_code <= 511): + response = requests.get(url) + + if response: json_response = response.json() html = json_response["html"] author_name = json_response["author_name"] @@ -73,10 +75,20 @@ class JsonParser: def parse(self): """Parses the archived tweets in JSON format.""" - try: - response = httpx.get(self.archived_tweet_url) - if response and not (400 <= response.status_code <= 511): + max_attempts = 5 + try: + for attempt in range(max_attempts): + try: + response = requests.get(self.archived_tweet_url) + break + except requests.exceptions.ConnectionError: + if attempt < max_attempts - 1: + time.sleep(0.5) + else: + raise + + if response: json_data = response.json() if "data" in json_data: @@ -153,7 +165,7 @@ class TweetsParser: parsed_text_json = "" if response[3] == "application/json": - json_parser = JsonParser(encoded_archived_tweet) + json_parser = JsonParser(encoded_parsed_archived_tweet) if json_parser: text_json = json_parser.parse() parsed_text_json = semicolon_parser(text_json) @@ -185,10 +197,7 @@ class TweetsParser: for future in as_completed(futures): try: - with httpx.Client(timeout=60.0): - future.result() - except httpx.RequestError as e: - rprint(f"[red]{e}") + future.result() except Exception as e: rprint(f"[red]{e}") diff --git a/waybacktweets/request_tweets.py b/waybacktweets/request_tweets.py index 78ba5a2..0093629 100644 --- a/waybacktweets/request_tweets.py +++ b/waybacktweets/request_tweets.py @@ -1,4 +1,4 @@ -import httpx +import requests from rich import print as rprint @@ -35,15 +35,15 @@ class WaybackTweets: print("Making a request to the Internet Archive...") try: - response = httpx.get(url, params=params) + response = requests.get(url, params=params) - if not (400 <= response.status_code <= 511): + if response: return response.json() - except httpx._exceptions.ReadTimeout: + except requests.exceptions.ReadTimeout: rprint("[red]Connection to web.archive.org timed out.") - except httpx._exceptions.ConnectError: + except requests.exceptions.ConnectionError: rprint("[red]Failed to establish a new connection with web.archive.org.") - except httpx._exceptions.HTTPError: + except requests.exceptions.HTTPError: rprint( "[red]Temporarily Offline: Internet Archive services are temporarily offline. Please check Internet Archive Twitter feed (https://twitter.com/internetarchive) for the latest information." # noqa: E501 ) -- 2.34.1