add progress bar, refactoring
author Claromes <claromes@hey.com>
Sun, 6 Aug 2023 01:01:31 +0000 (22:01 -0300)
committer Claromes <claromes@hey.com>
Sun, 6 Aug 2023 01:01:31 +0000 (22:01 -0300)
README.md
app.py

index 8205dfe0fc6ad8861aa83b4c4fc1ca33aeca2956..c7cf677dae13cc815d2fc2ea0d38e71b97a3b46c 100644 (file)
--- a/README.md
+++ b/README.md
@@ -42,6 +42,8 @@ Streamlit will be served at http://localhost:8501
 - [x] `only_deleted` checkbox selected for handles without deleted tweets
 - [x] Pagination: set session variable on first click
 - [x] Pagination: scroll to top
+- [ ] `IndexError`
+- [ ] Timeout error
 
 ## Roadmap
 
@@ -56,5 +58,7 @@ Streamlit will be served at http://localhost:8501
 - [ ] Range size defined by user
 - [ ] `parse_links` exception
 - [ ] Add current page to page title
+- [ ] Parse MIME type `warc/revisit`
+- [ ] Filter by period/datetime
 
 ## [Changelog](/CHANGELOG.md)
diff --git a/app.py b/app.py
index d02569efc0954e12d83eefad37b5a91852d14f5f..299dddca5c2e03c124518d5cd8e1718178466419 100644 (file)
--- a/app.py
+++ b/app.py
@@ -4,6 +4,7 @@ import streamlit as st
 import streamlit.components.v1 as components
 import json
 import re
+from bs4 import BeautifulSoup
 
 __version__ = '0.2'
 
@@ -39,6 +40,16 @@ hide_streamlit_style = '''
     header[data-testid="stHeader"] {
         opacity: 0.5;
     }
+    div[data-testid="stDecoration"] {
+        visibility: hidden;
+        height: 0%;
+        position: fixed;
+    }
+    div[data-testid="stStatusWidget"] {
+        visibility: hidden;
+        height: 0%;
+        position: fixed;
+    }
 </style>
 '''
 
@@ -99,6 +110,8 @@ def embed(tweet):
 
             for match in matches_html:
                 tweet_content_match = re.sub(r'<a[^>]*>|<\/a>', '', match[0].strip())
+                tweet_content_match = tweet_content_match.replace('<br>', '\n')
+
                 user_info_match = re.sub(r'<a[^>]*>|<\/a>', '', match[1].strip())
                 user_info_match = user_info_match.replace(')', '), ')
 
@@ -120,7 +133,7 @@ def embed(tweet):
         else:
             return False
     except requests.exceptions.Timeout:
-        st.error('Connection to web.archive.org timed out.')
+        st.error('Connection to publish.twitter.com timed out.')
 
 
 
@@ -128,7 +141,7 @@ def embed(tweet):
 def tweets_count(handle):
     url = 'https://web.archive.org/cdx/search/cdx?url=https://twitter.com/{}/status/*&output=json'.format(handle)
     try:
-        response = requests.get(url, timeout=5)
+        response = requests.get(url, timeout=10)
 
         if response.status_code == 200:
             data = response.json()
@@ -191,6 +204,8 @@ Display multiple archived tweets on Wayback Machine and avoid opening each link
 handle = st.text_input('username', placeholder='username', label_visibility='collapsed')
 query = st.button('Query', type='primary', use_container_width=True)
 
+bar = st.progress(0)
+
 if query or handle:
     if handle != st.session_state.current_handle:
         st.session_state.offset = 0
@@ -207,130 +222,119 @@ if query or handle:
     only_deleted = st.checkbox('Only deleted tweets')
 
     try:
-        with st.spinner(''):
-            progress = st.empty()
-            links = query_api(handle, tweets_per_page, st.session_state.offset)
-            parsed_links = parse_links(links)[0]
-            tweet_links = parse_links(links)[1]
-            mimetype = parse_links(links)[2]
-            timestamp = parse_links(links)[3]
+        progress = st.empty()
+        links = query_api(handle, tweets_per_page, st.session_state.offset)
+        parsed_links = parse_links(links)[0]
+        tweet_links = parse_links(links)[1]
+        mimetype = parse_links(links)[2]
+        timestamp = parse_links(links)[3]
+
+
+        if links:
+            st.divider()
+
+            st.session_state.current_handle = handle
+            st.session_state.current_query = query
+
+            return_none_count = 0
+
+            def prev_page():
+                st.session_state.offset -= tweets_per_page
 
+                #scroll to top config
+                st.session_state.update_component += 1
+                scroll_into_view()
+
+            def next_page():
+                st.session_state.offset += tweets_per_page
+
+                #scroll to top config
+                st.session_state.update_component += 1
+                scroll_into_view()
+
+            def display_tweet():
+                if is_RT[0] == True:
+                    st.info('*Retweet*')
+                st.write(tweet_content[0])
+                st.write(user_info[0])
 
-            if links:
                 st.divider()
 
-                st.session_state.current_handle = handle
-                st.session_state.current_query = query
+            def display_not_tweet():
+                if mimetype[i] == 'application/json':
+                    st.error('Tweet has been deleted.')
+                    response = requests.get(link, timeout=5)
+                    json_data = response.json()
+
+                    st.json(json_data, expanded=False)
+
+                    st.divider()
+                if mimetype[i] == 'text/html':
+                    st.error('Tweet has been deleted.')
+                    components.iframe(link, height=500)
 
-                return_none_count = 0
+                    st.divider()
 
-                def prev_page():
-                    st.session_state.offset -= tweets_per_page
+            start_index = st.session_state.offset
+            end_index = min(count, start_index + tweets_per_page)
 
-                    #scroll to top config
-                    st.session_state.update_component += 1
-                    scroll_into_view()
+            for i in range(tweets_per_page):
+                try:
+                    bar.progress((i*3) + 13)
 
-                def next_page():
-                    st.session_state.offset += tweets_per_page
+                    link = parsed_links[i]
+                    tweet = embed(tweet_links[i])
 
-                    #scroll to top config
-                    st.session_state.update_component += 1
-                    scroll_into_view()
+                    if not only_deleted:
+                        attr(i)
 
-                start_index = st.session_state.offset
-                end_index = min(count, start_index + tweets_per_page)
+                        if tweet:
+                            status_code = tweet[0]
+                            tweet_content = tweet[1]
+                            user_info = tweet[2]
+                            is_RT = tweet[3]
 
-                for i in range(tweets_per_page):
-                    try:
-                        link = parsed_links[i]
-                        tweet = embed(tweet_links[i])
+                            if mimetype[i] == 'application/json':
+                                display_tweet()
 
-                        if not only_deleted:
+                            if mimetype[i] == 'text/html':
+                                display_tweet()
+                        elif not tweet:
+                            display_not_tweet()
+
+                    if only_deleted:
+                        if not tweet:
+                            return_none_count += 1
                             attr(i)
 
-                            if tweet:
-                                status_code = tweet[0]
-                                tweet_content = tweet[1]
-                                user_info = tweet[2]
-                                is_RT = tweet[3]
-
-                                if mimetype[i] == 'application/json':
-                                    if is_RT[0] == True:
-                                        st.info('*Retweet*')
-                                    st.write(tweet_content[0])
-                                    st.write(user_info[0])
-
-                                    st.divider()
-                                if mimetype[i] == 'text/html':
-                                    if is_RT[0] == True:
-                                        st.info('*Retweet*')
-                                    st.write(tweet_content[0])
-                                    st.write(user_info[0])
-
-                                    st.divider()
-                            elif not tweet:
-                                if mimetype[i] == 'application/json':
-                                    st.error('Tweet has been deleted.')
-                                    response = requests.get(link, timeout=5)
-                                    json_data = response.json()
-
-                                    st.json(json_data, expanded=False)
-
-                                    st.divider()
-                                if mimetype[i] == 'text/html':
-                                    st.error('Tweet has been deleted.')
-                                    st.info('IFRAME')
-                                    st.write(link)
-
-                                    st.divider()
-
-                        if only_deleted:
-                            if not tweet:
-                                return_none_count += 1
-                                attr(i)
-
-                                if mimetype[i] == 'application/json':
-                                    st.error('Tweet has been deleted.')
-                                    response = requests.get(link, timeout=5)
-                                    json_data = response.json()
-
-                                    st.json(json_data, expanded=False)
-
-                                    st.divider()
-                                if mimetype[i] == 'text/html':
-                                    st.error('Tweet has been deleted.')
-                                    st.info('IFRAME')
-                                    st.write(link)
-
-                                    st.divider()
-
-                            progress.write('{} URLs have been captured in the range {}-{}'.format(return_none_count, start_index, end_index))
-
-                        if start_index <= 0:
-                            st.session_state.prev_disabled = True
-                        else:
-                            st.session_state.prev_disabled = False
-
-                        if i + 1 == count:
-                            st.session_state.next_disabled = True
-                        else:
-                            st.session_state.next_disabled = False
-                    except IndexError:
-                        if start_index <= 0:
-                            st.session_state.prev_disabled = True
-                        else:
-                            st.session_state.prev_disabled = False
+                            display_not_tweet()
+
+                        progress.write('{} URLs have been captured in the range {}-{}'.format(return_none_count, start_index, end_index))
 
+                    if start_index <= 0:
+                        st.session_state.prev_disabled = True
+                    else:
+                        st.session_state.prev_disabled = False
+
+                    if i + 1 == count:
                         st.session_state.next_disabled = True
+                    else:
+                        st.session_state.next_disabled = False
+                except IndexError:
+                    if start_index <= 0:
+                        st.session_state.prev_disabled = True
+                    else:
+                        st.session_state.prev_disabled = False
+
+                    st.session_state.next_disabled = True
 
-                prev, _ , next = st.columns([3, 4, 3])
+            prev, _ , next = st.columns([3, 4, 3])
 
-                prev.button('Previous', disabled=st.session_state.prev_disabled, key='prev_button_key', on_click=prev_page, type='primary', use_container_width=True)
-                next.button('Next', disabled=st.session_state.next_disabled, key='next_button_key', on_click=next_page, type='primary', use_container_width=True)
+            prev.button('Previous', disabled=st.session_state.prev_disabled, key='prev_button_key', on_click=prev_page, type='primary', use_container_width=True)
+            next.button('Next', disabled=st.session_state.next_disabled, key='next_button_key', on_click=next_page, type='primary', use_container_width=True)
 
-            if not links:
-                st.error('Unable to query the Wayback Machine API.')
+        if not links:
+            st.error('Unable to query the Wayback Machine API.')
     except TypeError as e:
         st.error('''
         {}. Refresh this page and try again.