From: Claromes Date: Sun, 6 Aug 2023 01:01:31 +0000 (-0300) Subject: add progress bar, refactoring X-Git-Url: https://git.claromes.com/?a=commitdiff_plain;h=09581f0df68aff027ee870d940b2f773ba891a1e;p=waybacktweets.git add progress bar, refactoring --- diff --git a/README.md b/README.md index 8205dfe..c7cf677 100644 --- a/README.md +++ b/README.md @@ -42,6 +42,8 @@ Streamlit will be served at http://localhost:8501 - [x] `only_deleted` checkbox selected for handles without deleted tweets - [x] Pagination: set session variable on first click - [x] Pagination: scroll to top +- [ ] `IndexError` +- [ ] Timeout error ## Roadmap @@ -56,5 +58,7 @@ Streamlit will be served at http://localhost:8501 - [ ] Range size defined by user - [ ] `parse_links` exception - [ ] Add current page to page title +- [ ] Parse MIME type `warc/revisit` +- [ ] Filter by period/datetime ## [Changelog](/CHANGELOG.md) diff --git a/app.py b/app.py index d02569e..299dddc 100644 --- a/app.py +++ b/app.py @@ -4,6 +4,7 @@ import streamlit as st import streamlit.components.v1 as components import json import re +from bs4 import BeautifulSoup __version__ = '0.2' @@ -39,6 +40,16 @@ hide_streamlit_style = ''' header[data-testid="stHeader"] { opacity: 0.5; } + div[data-testid="stDecoration"] { + visibility: hidden; + height: 0%; + position: fixed; + } + div[data-testid="stStatusWidget"] { + visibility: hidden; + height: 0%; + position: fixed; + } ''' @@ -99,6 +110,8 @@ def embed(tweet): for match in matches_html: tweet_content_match = re.sub(r']*>|<\/a>', '', match[0].strip()) + tweet_content_match = tweet_content_match.replace('
', '\n') + user_info_match = re.sub(r']*>|<\/a>', '', match[1].strip()) user_info_match = user_info_match.replace(')', '), ') @@ -120,7 +133,7 @@ def embed(tweet): else: return False except requests.exceptions.Timeout: - st.error('Connection to web.archive.org timed out.') + st.error('Connection to publish.twitter.com timed out.') @@ -128,7 +141,7 @@ def embed(tweet): def tweets_count(handle): url = 'https://web.archive.org/cdx/search/cdx?url=https://twitter.com/{}/status/*&output=json'.format(handle) try: - response = requests.get(url, timeout=5) + response = requests.get(url, timeout=10) if response.status_code == 200: data = response.json() @@ -191,6 +204,8 @@ Display multiple archived tweets on Wayback Machine and avoid opening each link handle = st.text_input('username', placeholder='username', label_visibility='collapsed') query = st.button('Query', type='primary', use_container_width=True) +bar = st.progress(0) + if query or handle: if handle != st.session_state.current_handle: st.session_state.offset = 0 @@ -207,130 +222,119 @@ if query or handle: only_deleted = st.checkbox('Only deleted tweets') try: - with st.spinner(''): - progress = st.empty() - links = query_api(handle, tweets_per_page, st.session_state.offset) - parsed_links = parse_links(links)[0] - tweet_links = parse_links(links)[1] - mimetype = parse_links(links)[2] - timestamp = parse_links(links)[3] + progress = st.empty() + links = query_api(handle, tweets_per_page, st.session_state.offset) + parsed_links = parse_links(links)[0] + tweet_links = parse_links(links)[1] + mimetype = parse_links(links)[2] + timestamp = parse_links(links)[3] + + + if links: + st.divider() + + st.session_state.current_handle = handle + st.session_state.current_query = query + + return_none_count = 0 + + def prev_page(): + st.session_state.offset -= tweets_per_page + #scroll to top config + st.session_state.update_component += 1 + scroll_into_view() + + def next_page(): + st.session_state.offset += tweets_per_page + + #scroll to top config + st.session_state.update_component += 1 + scroll_into_view() + + def display_tweet(): + if is_RT[0] == True: + st.info('*Retweet*') + st.write(tweet_content[0]) + st.write(user_info[0]) - if links: st.divider() - st.session_state.current_handle = handle - st.session_state.current_query = query + def display_not_tweet(): + if mimetype[i] == 'application/json': + st.error('Tweet has been deleted.') + response = requests.get(link, timeout=5) + json_data = response.json() + + st.json(json_data, expanded=False) + + st.divider() + if mimetype[i] == 'text/html': + st.error('Tweet has been deleted.') + components.iframe(link, height=500) - return_none_count = 0 + st.divider() - def prev_page(): - st.session_state.offset -= tweets_per_page + start_index = st.session_state.offset + end_index = min(count, start_index + tweets_per_page) - #scroll to top config - st.session_state.update_component += 1 - scroll_into_view() + for i in range(tweets_per_page): + try: + bar.progress((i*3) + 13) - def next_page(): - st.session_state.offset += tweets_per_page + link = parsed_links[i] + tweet = embed(tweet_links[i]) - #scroll to top config - st.session_state.update_component += 1 - scroll_into_view() + if not only_deleted: + attr(i) - start_index = st.session_state.offset - end_index = min(count, start_index + tweets_per_page) + if tweet: + status_code = tweet[0] + tweet_content = tweet[1] + user_info = tweet[2] + is_RT = tweet[3] - for i in range(tweets_per_page): - try: - link = parsed_links[i] - tweet = embed(tweet_links[i]) + if mimetype[i] == 'application/json': + display_tweet() - if not only_deleted: + if mimetype[i] == 'text/html': + display_tweet() + elif not tweet: + display_not_tweet() + + if only_deleted: + if not tweet: + return_none_count += 1 attr(i) - if tweet: - status_code = tweet[0] - tweet_content = tweet[1] - user_info = tweet[2] - is_RT = tweet[3] - - if mimetype[i] == 'application/json': - if is_RT[0] == True: - st.info('*Retweet*') - st.write(tweet_content[0]) - st.write(user_info[0]) - - st.divider() - if mimetype[i] == 'text/html': - if is_RT[0] == True: - st.info('*Retweet*') - st.write(tweet_content[0]) - st.write(user_info[0]) - - st.divider() - elif not tweet: - if mimetype[i] == 'application/json': - st.error('Tweet has been deleted.') - response = requests.get(link, timeout=5) - json_data = response.json() - - st.json(json_data, expanded=False) - - st.divider() - if mimetype[i] == 'text/html': - st.error('Tweet has been deleted.') - st.info('IFRAME') - st.write(link) - - st.divider() - - if only_deleted: - if not tweet: - return_none_count += 1 - attr(i) - - if mimetype[i] == 'application/json': - st.error('Tweet has been deleted.') - response = requests.get(link, timeout=5) - json_data = response.json() - - st.json(json_data, expanded=False) - - st.divider() - if mimetype[i] == 'text/html': - st.error('Tweet has been deleted.') - st.info('IFRAME') - st.write(link) - - st.divider() - - progress.write('{} URLs have been captured in the range {}-{}'.format(return_none_count, start_index, end_index)) - - if start_index <= 0: - st.session_state.prev_disabled = True - else: - st.session_state.prev_disabled = False - - if i + 1 == count: - st.session_state.next_disabled = True - else: - st.session_state.next_disabled = False - except IndexError: - if start_index <= 0: - st.session_state.prev_disabled = True - else: - st.session_state.prev_disabled = False + display_not_tweet() + + progress.write('{} URLs have been captured in the range {}-{}'.format(return_none_count, start_index, end_index)) + if start_index <= 0: + st.session_state.prev_disabled = True + else: + st.session_state.prev_disabled = False + + if i + 1 == count: st.session_state.next_disabled = True + else: + st.session_state.next_disabled = False + except IndexError: + if start_index <= 0: + st.session_state.prev_disabled = True + else: + st.session_state.prev_disabled = False + + st.session_state.next_disabled = True - prev, _ , next = st.columns([3, 4, 3]) + prev, _ , next = st.columns([3, 4, 3]) - prev.button('Previous', disabled=st.session_state.prev_disabled, key='prev_button_key', on_click=prev_page, type='primary', use_container_width=True) - next.button('Next', disabled=st.session_state.next_disabled, key='next_button_key', on_click=next_page, type='primary', use_container_width=True) + prev.button('Previous', disabled=st.session_state.prev_disabled, key='prev_button_key', on_click=prev_page, type='primary', use_container_width=True) + next.button('Next', disabled=st.session_state.next_disabled, key='next_button_key', on_click=next_page, type='primary', use_container_width=True) - if not links: - st.error('Unable to query the Wayback Machine API.') + if not links: + st.error('Unable to query the Wayback Machine API.') except TypeError as e: st.error(''' {}. Refresh this page and try again.