update streamlit version, add filter by year, add filter by range size
author  Claromes <claromes@hey.com>
Thu, 28 Sep 2023 05:47:33 +0000 (02:47 -0300)
committer  Claromes <claromes@hey.com>
Thu, 28 Sep 2023 05:47:33 +0000 (02:47 -0300)
README.md
app.py
requirements.txt

index b9853f95f6d9fbd408aa26b5e826d50afbb904b6..a8352e46c98ed1c234bb5366306b9beb99e41dde 100644 (file)
--- a/README.md
+++ b/README.md
@@ -58,12 +58,12 @@ Streamlit will be served at http://localhost:8501
 - [x] Review data cache
 - [x] Changelog
 - [ ] Prevent duplicate URLs
-- [ ] Range size defined by user
+- [x] Range size defined by user
 - [ ] `parse_links` exception
 - [ ] Add current page to page title
 - [ ] Parse MIME type `warc/revisit`
 - [ ] Parse MIME type `text/plain`
-- [ ] Filter by period/datetime
+- [x] Filter by period/datetime
 - [ ] Apply filters by API endpoints
 - [ ] Add contributing guidelines
 
diff --git a/app.py b/app.py
index f1e6e8f9b404bf51944a2a6246a17cb4ca0466d5..6bbb22cacf1ea9a9222a4f903bb4b65cee93a926 100644 (file)
--- a/app.py
+++ b/app.py
@@ -70,6 +70,9 @@ if 'update_component' not in st.session_state:
 if 'offset' not in st.session_state:
     st.session_state.offset = 0
 
+if 'date_created' not in st.session_state:
+    st.session_state.date_created = (2006, year)
+
 def scroll_into_view():
     js = '''
     <script>
@@ -126,11 +129,9 @@ def embed(tweet):
     except requests.exceptions.Timeout:
         st.error('Connection to publish.twitter.com timed out.')
 
-
-
 @st.cache_data(ttl=1800, show_spinner=False)
-def tweets_count(handle):
-    url = 'https://web.archive.org/cdx/search/cdx?url=https://twitter.com/{}/status/*&output=json'.format(handle)
+def tweets_count(handle, date_created):
+    url = 'https://web.archive.org/cdx/search/cdx?url=https://twitter.com/{}/status/*&output=json&from={}&to={}'.format(handle,  date_created[0], date_created[1])
     try:
         response = requests.get(url)
 
@@ -144,14 +145,13 @@ def tweets_count(handle):
     except requests.exceptions.Timeout:
         st.error('Connection to web.archive.org timed out.')
 
-
 @st.cache_data(ttl=1800, show_spinner=False)
-def query_api(handle, limit, offset):
+def query_api(handle, limit, offset, date_created):
     if not handle:
         st.warning('username, please!')
         st.stop()
 
-    url = 'https://web.archive.org/cdx/search/cdx?url=https://twitter.com/{}/status/*&output=json&limit={}&offset={}'.format(handle, limit, offset)
+    url = 'https://web.archive.org/cdx/search/cdx?url=https://twitter.com/{}/status/*&output=json&limit={}&offset={}&from={}&to={}'.format(handle, limit, offset, date_created[0], date_created[1])
     try:
         response = requests.get(url)
 
@@ -183,39 +183,39 @@ def attr(i):
     '''.format(i+1 + st.session_state.offset, link, mimetype[i], datetime.datetime.strptime(timestamp[i], "%Y%m%d%H%M%S"), tweet_links[i]))
 
 # UI
-st.title('''
-Wayback Tweets [![GitHub release (latest by date including pre-releases)](https://img.shields.io/github/v/release/claromes/waybacktweets?include_prereleases)](https://github.com/claromes/waybacktweets/releases)
-''', anchor=False)
-st.write('''
-Display multiple archived tweets on Wayback Machine and avoid opening each link manually
+st.title('Wayback Tweets [![Star](https://img.shields.io/github/stars/claromes/waybacktweets?style=social)](https://github.com/claromes/waybacktweets)', anchor=False)
+st.write('Display multiple archived tweets on Wayback Machine via Wayback CDX Server API')
+
+handle = st.text_input('Username', placeholder='jack')
 
-*via Wayback CDX Server API*
-''')
+st.session_state.date_created = st.slider('Tweets created between', 2006, year, (2006, year))
+
+tweets_per_page = st.slider('Tweets per page', 25, 1000, 25, 25)
+
+only_deleted = st.checkbox('Only deleted tweets')
 
-handle = st.text_input('username', placeholder='username', label_visibility='collapsed')
 query = st.button('Query', type='primary', use_container_width=True)
 
 bar = st.empty()
 
-if query or handle:
+if query or handle :
     if handle != st.session_state.current_handle:
         st.session_state.offset = 0
 
     if query != st.session_state.current_query:
         st.session_state.offset = 0
 
-    count = tweets_count(handle)
+    count = tweets_count(handle, st.session_state.date_created)
 
     st.write('**{} URLs have been captured**'.format(count))
 
-    tweets_per_page = 30
-
-    only_deleted = st.checkbox('Only deleted tweets')
+    if tweets_per_page > count:
+        tweets_per_page = count
 
     try:
         bar.progress(0)
         progress = st.empty()
-        links = query_api(handle, tweets_per_page, st.session_state.offset)
+        links = query_api(handle, tweets_per_page, st.session_state.offset, st.session_state.date_created)
 
         parse = parse_links(links)
         parsed_links = parse[0]
@@ -249,7 +249,7 @@ if query or handle:
                 if is_RT[0] == True:
                     st.info('*Retweet*')
                 st.write(tweet_content[0])
-                st.write(user_info[0])
+                st.write(f'**{user_info[0]}**')
 
                 st.divider()
 
@@ -257,11 +257,14 @@ if query or handle:
                 if mimetype[i] == 'application/json':
                     st.error('Tweet has been deleted.')
                     response_json = requests.get(link)
-                    json_data = response_json.json()
-                    json_text = response_json.json()['text']
+                    if response_json.status_code == 200:
+                        json_data = response_json.json()
+                        json_text = response_json.json()['text']
 
-                    st.code(json_text)
-                    st.json(json_data, expanded=False)
+                        st.code(json_text)
+                        st.json(json_data, expanded=False)
+                    else:
+                        st.error(response_json.status_code)
 
                     st.divider()
                 if mimetype[i] == 'text/html':
index dff5d86e24b756c9ce7b2d569c541b2112a0b2c6..6735d209a3a8118955d0ad8539a5793be8f11b8d 100644 (file)
@@ -1,2 +1,2 @@
 requests==2.30.0
-streamlit==1.25.0
\ No newline at end of file
+streamlit==1.27.0
\ No newline at end of file