import datetime
import re

import requests
import streamlit as st
import streamlit.components.v1 as components

year = datetime.datetime.now().year
-st.set_page_config(
- page_title='Wayback Tweets',
- page_icon='🏛️',
- layout='centered',
- menu_items={
-
- 'About': '''
+st.set_page_config(page_title='Wayback Tweets',
+ page_icon='🏛️',
+ layout='centered',
+ menu_items={
+ 'About':
+ '''
## 🏛️ Wayback Tweets
[Release](https://github.com/claromes/waybacktweets/releases) [License](https://github.com/claromes/waybacktweets/blob/main/LICENSE.md)
-------
''',
- 'Report a bug': 'https://github.com/claromes/waybacktweets/issues'
- }
-)
+ 'Report a bug':
+ 'https://github.com/claromes/waybacktweets/issues'
+ })
# https://discuss.streamlit.io/t/remove-hide-running-man-animation-on-top-of-page/21773/3
hide_streamlit_style = '''
if 'count' not in st.session_state:
st.session_state.count = False
+
def scroll_into_view():
js = f'''
<script>
components.html(js, width=0, height=0)
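    # Assumed mechanism (not shown in this excerpt): the f-string above
    # presumably interpolates st.session_state.update_component, so bumping
    # that counter changes the payload and makes Streamlit re-run the script
    # on each page change.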
+
def clean_tweet(tweet):
handle = st.session_state.current_handle.lower()
tweet_lower = tweet.lower()
else:
return tweet
+
def clean_link(link):
handle = st.session_state.current_handle.lower()
link = link.lower()
else:
return link
+
def pattern_tweet(tweet):
# Reply: /status//
# Link: /status///
else:
return tweet
+
def pattern_tweet_id(tweet):
# Delete sub-endpoint (/photos, /likes, /retweet...)
pattern_username = re.compile(r'https://twitter\.com/([^/]+)/status/\d+')
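    # e.g. 'https://twitter.com/jack/status/20' matches, with group(1) == 'jack'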
else:
return tweet
+
def check_double_status(url_wb, url_tweet):
    if url_wb.count('/status/') == 2 and 'twitter.com' not in url_tweet:
return True
return False
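# Illustrative note (assumed from the patterns above): a reply capture can
# repeat '/status/' in the archived URL while the parsed tweet link lacks the
# 'twitter.com' host; callers then rebuild the original URL by hand.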
+
def embed(tweet):
try:
url = f'https://publish.twitter.com/oembed?url={clean_tweet(tweet)}'
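        # Twitter's oEmbed endpoint (publish.twitter.com/oembed) returns JSON
        # whose 'html' field holds the embeddable blockquote markup; the
        # regexes below presumably pull tweet text and author out of it.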
is_RT = []
for match in matches_html:
- tweet_content_match = re.sub(r'<a[^>]*>|<\/a>', '', match[0].strip())
+ tweet_content_match = re.sub(r'<a[^>]*>|<\/a>', '',
+ match[0].strip())
tweet_content_match = tweet_content_match.replace('<br>', '\n')
- user_info_match = re.sub(r'<a[^>]*>|<\/a>', '', match[1].strip())
+ user_info_match = re.sub(r'<a[^>]*>|<\/a>', '',
+ match[1].strip())
user_info_match = user_info_match.replace(')', '), ')
match_author = re.search(regex_author, user_info_match)
except UnboundLocalError:
st.empty()
+
@st.cache_data(ttl=1800, show_spinner=False)
def tweets_count(handle, saved_at):
url = f'https://web.archive.org/cdx/search/cdx?url=https://twitter.com/{handle}/status/*&collapse=timestamp:8&output=json&from={saved_at[0]}&to={saved_at[1]}'
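    # Wayback CDX API notes: 'collapse=timestamp:8' keeps at most one capture
    # per day (the first 8 timestamp digits are YYYYMMDD), and 'output=json'
    # returns a header row followed by rows of
    # [urlkey, timestamp, original, mimetype, statuscode, digest, length].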
except UnboundLocalError:
st.empty()
+
@st.cache_data(ttl=1800, show_spinner=False)
def query_api(handle, limit, offset, saved_at):
if not handle:
except requests.exceptions.HTTPError:
st.error('''
**Temporarily Offline**
-
+
    Internet Archive services are temporarily offline. Please check the Internet Archive's [Twitter feed](https://twitter.com/internetarchive/) for the latest information.
''')
st.stop()
+
@st.cache_data(ttl=1800, show_spinner=False)
def parse_links(links):
parsed_links = []
return parsed_links, tweet_links, parsed_mimetype, timestamp
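# Assumed invariant: parse_links returns four parallel lists, so
# parsed_links[i], tweet_links[i], parsed_mimetype[i] and timestamp[i] all
# describe the same capture (the loops below index them together).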
+
def attr(i):
original_tweet = pattern_tweet_id(clean_tweet(tweet_links[i]))
if status:
- original_tweet = pattern_tweet_id(f'https://twitter.com/{tweet_links[i]}')
+ original_tweet = pattern_tweet_id(
+ f'https://twitter.com/{tweet_links[i]}')
    elif '://' not in tweet_links[i]:
original_tweet = pattern_tweet_id(f'https://{tweet_links[i]}')
- st.markdown(f'{i+1 + st.session_state.offset}. [**archived url**]({link}) · [**original url**]({original_tweet}) · **MIME Type:** {mimetype[i]} · **Saved at:** {datetime.datetime.strptime(timestamp[i], "%Y%m%d%H%M%S")}')
+ st.markdown(
+ f'{i+1 + st.session_state.offset}. [**archived url**]({link}) · [**original url**]({original_tweet}) · **MIME Type:** {mimetype[i]} · **Saved at:** {datetime.datetime.strptime(timestamp[i], "%Y%m%d%H%M%S")}'
+ )
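    # e.g. timestamp '20230101123456' parses to datetime.datetime(2023, 1, 1, 12, 34, 56)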
+
def display_tweet():
- if mimetype[i] == 'application/json' or mimetype[i] == 'text/html' or mimetype[i] == 'unk' or mimetype[i] == 'warc/revisit':
+    if mimetype[i] in ('application/json', 'text/html', 'unk',
+                       'warc/revisit'):
        if is_RT[0]:
st.info('*Retweet*')
st.write(tweet_content[0])
st.divider()
+
def display_not_tweet():
original_link = pattern_tweet_id(clean_tweet(tweet_links[i]))
if status:
- original_link = pattern_tweet_id(f'https://twitter.com/{tweet_links[i]}')
+ original_link = pattern_tweet_id(
+ f'https://twitter.com/{tweet_links[i]}')
    elif '://' not in tweet_links[i]:
original_link = pattern_tweet_id(f'https://{tweet_links[i]}')
response_html = requests.get(original_link)
- if mimetype[i] == 'text/html' or mimetype[i] == 'warc/revisit' or mimetype[i] == 'unk':
- if ('.jpg' in tweet_links[i] or '.png' in tweet_links[i]) and response_html.status_code == 200:
+    if mimetype[i] in ('text/html', 'warc/revisit', 'unk'):
+        if (('.jpg' in tweet_links[i] or '.png' in tweet_links[i])
+                and response_html.status_code == 200):
components.iframe(tweet_links[i], height=500, scrolling=True)
elif '/status/' not in original_link:
st.info("This isn't a status or is not available")
st.error('Connection to web.archive.org timed out.')
st.divider()
except requests.exceptions.ConnectionError:
- st.error('Failed to establish a new connection with web.archive.org.')
+ st.error(
+ 'Failed to establish a new connection with web.archive.org.')
st.divider()
except UnboundLocalError:
st.empty()
st.warning('MIME Type was not parsed.')
st.divider()
+
def prev_page():
st.session_state.offset -= tweets_per_page
st.session_state.update_component += 1
scroll_into_view()
+
def next_page():
st.session_state.offset += tweets_per_page
    # scroll to top: bump the counter so the injected scroll script re-runs
st.session_state.update_component += 1
scroll_into_view()
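# e.g. with tweets_per_page == 25: offset 0 shows captures 1-25; Next moves
# the offset to 25 (captures 26-50) and Previous moves it back to 0.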
-
+
+
# UI
-st.title('Wayback Tweets [Release](https://github.com/claromes/waybacktweets/releases) [GitHub](https://github.com/claromes/waybacktweets)', anchor=False)
-st.write('Display multiple archived tweets on Wayback Machine and avoid opening each link manually')
+st.title(
+    'Wayback Tweets [Release](https://github.com/claromes/waybacktweets/releases) [GitHub](https://github.com/claromes/waybacktweets)',
+    anchor=False)
+st.write(
+    'Display multiple archived tweets on the Wayback Machine without opening each link manually'
+)
handle = st.text_input('Username', placeholder='jack')
-st.session_state.saved_at = st.slider('Tweets saved between', 2006, year, (2006, year))
+st.session_state.saved_at = st.slider('Tweets saved between', 2006, year,
+ (2006, year))
-not_available = st.checkbox('Original URLs not available', help='Due to changes in X, it is possible to find available tweets if you are logged into X')
+not_available = st.checkbox(
+    'Original URLs not available',
+    help='Due to changes in X, some tweets may still be available if you are logged into X'
+)
query = st.button('Query', type='primary', use_container_width=True)
st.session_state.count = tweets_count(handle, st.session_state.saved_at)
- st.caption('The search optimization uses an 8-digit [collapsing strategy](https://github.com/internetarchive/wayback/blob/master/wayback-cdx-server/README.md?ref=hackernoon.com#collapsing), refining the captures to one per day. The number of tweets per page is set to 25, and this is a fixed value due to the API rate limit.')
+    st.caption(
+        'The search optimization uses an 8-digit [collapsing strategy](https://github.com/internetarchive/wayback/blob/master/wayback-cdx-server/README.md?ref=hackernoon.com#collapsing), refining the captures to one per day. The number of tweets per page is fixed at 25 due to the API rate limit.'
+    )
st.write(f'**{st.session_state.count} URLs have been captured**')
if st.session_state.count:
try:
progress = st.empty()
- links = query_api(handle, tweets_per_page, st.session_state.offset, st.session_state.saved_at)
+ links = query_api(handle, tweets_per_page, st.session_state.offset,
+ st.session_state.saved_at)
parse = parse_links(links)
parsed_links = parse[0]
st.session_state.current_handle = handle
return_none_count = 0
-
+
start_index = st.session_state.offset
- end_index = min(st.session_state.count, start_index + tweets_per_page)
+ end_index = min(st.session_state.count,
+ start_index + tweets_per_page)
with st.spinner('Fetching...'):
for i in range(tweets_per_page):
tweet = embed(tweet_links[i])
status = check_double_status(link, tweet_links[i])
-
+
if not not_available:
attr(i)
display_not_tweet()
- progress.write(f'{return_none_count} URLs have been captured in the range {start_index}-{end_index}')
+ progress.write(
+ f'{return_none_count} URLs have been captured in the range {start_index}-{end_index}'
+ )
if start_index <= 0:
st.session_state.prev_disabled = True
st.session_state.next_disabled = True
- prev, _ , next = st.columns([3, 4, 3])
-
- prev.button('Previous', disabled=st.session_state.prev_disabled, key='prev_button_key', on_click=prev_page, type='primary', use_container_width=True)
- next.button('Next', disabled=st.session_state.next_disabled, key='next_button_key', on_click=next_page, type='primary', use_container_width=True)
+    prev_col, _, next_col = st.columns([3, 4, 3])
+
+    prev_col.button('Previous',
+                    disabled=st.session_state.prev_disabled,
+                    key='prev_button_key',
+                    on_click=prev_page,
+                    type='primary',
+                    use_container_width=True)
+    next_col.button('Next',
+                    disabled=st.session_state.next_disabled,
+                    key='next_button_key',
+                    on_click=next_page,
+                    type='primary',
+                    use_container_width=True)
if not links:
st.error('Unable to query the Wayback Machine API.')