--- /dev/null
+[theme]
+base = "light"
+primaryColor = "black"
+secondaryBackgroundColor = "gainsboro"
+textColor = "black"
+backgroundColor = "whitesmoke"
+font = "sans serif"
+
+[client]
+toolbarMode = "minimal"
+
+[server]
+port = 8501
# Wayback Tweets
-[](https://pypi.org/project/waybacktweets)
+[](https://pypi.org/project/waybacktweets) [](https://waybacktweets.streamlit.app)
Retrieves archived tweets CDX data from the Wayback Machine, performs necessary parsing, and saves the data in CSV, JSON, and HTML formats.
timestamp_to = parse_date("20191231")
limit = 250
offset = 0
+matchtype = "exact"
-api = WaybackTweets(username, collapse, timestamp_from, timestamp_to, limit, offset)
+api = WaybackTweets(username, collapse, timestamp_from, timestamp_to, limit, offset, matchtype)
archived_tweets = api.get()
```
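+
+A possible continuation, parsing and exporting the retrieved data (the `TweetsExporter` constructor and `dataframe` attribute appear elsewhere in this changeset; the exact `TweetsParser` call is an assumption):
+
+```python
+from waybacktweets.api.export_tweets import TweetsExporter
+from waybacktweets.api.parse_tweets import TweetsParser
+
+field_options = ["archived_urlkey", "archived_tweet_url", "available_tweet_text"]
+
+# Assumed parser usage: the same (data, username, fields) pattern as the exporter.
+parser = TweetsParser(archived_tweets, username, field_options)
+parsed_tweets = parser.parse()
+
+exporter = TweetsExporter(parsed_tweets, username, field_options)
+df = exporter.dataframe  # DataFrame of the archived tweets data
+```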
+++ /dev/null
-[theme]
-base = "light"
-primaryColor = "#ef5552"
-secondaryBackgroundColor = "#efefef"
-textColor = "#000000"
-backgroundColor = "#f9f9f9"
-font = "serif"
-
-[client]
-displayEnabled = true
-toolbarMode = "minimal"
import streamlit.components.v1 as components
from waybacktweets.api.export_tweets import TweetsExporter
-from waybacktweets.api.parse_tweets import TweetsParser
+from waybacktweets.api.parse_tweets import JsonParser, TweetsParser
from waybacktweets.api.request_tweets import WaybackTweets
-from waybacktweets.utils.utils import check_double_status, get_response
+from waybacktweets.utils.utils import (
+ check_double_status,
+ get_response,
+ semicolon_parser,
+)
# Initial Settings
page_icon=LOGO,
layout="centered",
menu_items={
+ "About": f"""
+ [](https://github.com/claromes/waybacktweets/releases) [](https://github.com/claromes/waybacktweets/blob/main/LICENSE.md) [](https://github.com/claromes/waybacktweets)
+
+    Application that displays multiple archived tweets on the Wayback Machine to avoid opening each link manually.
+
+ The application is a prototype hosted on Streamlit Cloud, allowing users to apply filters and view tweets that lack the original URL. [Read more](https://claromes.github.io/waybacktweets/streamlit.html).
+
+ © Copyright 2023 - {datetime.datetime.now().year}, [Claromes](https://claromes.com) · Icon by The Doodle Library
+
+ ---
+""", # noqa: E501
"Report a bug": "https://github.com/claromes/waybacktweets/issues",
},
)
visibility: hidden;
}
img[data-testid="stLogo"] {
- scale: 3;
+ scale: 4;
padding-left: 10px;
}
</style>
st.logo(LOGO)
st.success(
- """**New Feature: CLI**
+ """**v1.0 🎉: CLI and Python Module**
+
+$ `pip install waybacktweets`
+
+$ `waybacktweets --from 20150101 --to 20191231 --limit 250 jack`
-You can now retrieve archived tweets using the Wayback Tweets command line tool.
-Download the archived tweets CDX data in CSV, JSON, and HTML formats.
+Retrieve archived tweets CDX data in CSV, JSON, and HTML formats using the command line.
-For more details, [read the documentation](https://claromes.github.io/waybacktweets).""" # noqa: E501
+Read the documentation: [claromes.github.io/waybacktweets](https://claromes.github.io/waybacktweets).""" # noqa: E501
)
st.title(
st.caption(
"[](https://github.com/claromes/waybacktweets/releases) [](https://github.com/claromes/waybacktweets)" # noqa: E501
)
-st.caption("Display multiple archived tweets on Wayback Machine")
+st.caption("Display multiple archived tweets on the Wayback Machine.")
+st.caption(
+ "Download data via command line with the [`waybacktweets`](https://pypi.org/project/waybacktweets) Python package." # noqa: E501
+)
username = st.text_input("Username", placeholder="Without @")
start_date,
end_date,
format="YYYY/MM/DD",
- help="YYYY/MM/DD",
+ help="Using the `from` and `to` filters. Format: YYYY/MM/DD",
)
-not_available = st.checkbox("Only tweets not available")
+not_available = st.checkbox(
+ "Only tweets not available",
+ help="Checks if the archived URL still exists on Twitter",
+)
unique = st.checkbox(
"Only unique URLs",
- help="Filtering by the collapse option using the urlkey field",
+ help="Filtering by the collapse option using the `urlkey` field and the URL Match Scope `prefix`", # noqa: E501
)
query = st.button("Query", type="primary", use_container_width=True)
# Tweet Listing Settings
-
if username != st.session_state.current_username:
st.session_state.current_username = username
st.session_state.offset = 0
username, st.session_state.archived_timestamp_filter
)
- st.caption(
- "The number of tweets per page is set to 25, and this is a fixed value due to the API rate limit." # noqa: E501
- )
- st.write(f"**{st.session_state.count} URLs have been captured**")
+ # st.caption(
+ # "The number of tweets per page is set to 25, and this is a fixed value due to the API rate limit." # noqa: E501
+ # )
+ # st.write(f"**{st.session_state.count} URLs have been captured**")
if st.session_state.count:
if tweets_per_page > st.session_state.count:
tweets_per_page = st.session_state.count
try:
- progress = st.empty()
-
# Tweet Listing Processing
+ progress = st.empty()
+
collapse = None
+ matchType = None
if unique:
collapse = "urlkey"
-
- response = WaybackTweets(
- username,
- collapse,
- st.session_state.archived_timestamp_filter[0],
- st.session_state.archived_timestamp_filter[1],
- tweets_per_page,
- st.session_state.offset,
- )
- archived_tweets = response.get()
+ matchType = "prefix"
with st.spinner("Parsing..."):
+ response = WaybackTweets(
+ username,
+ collapse,
+ st.session_state.archived_timestamp_filter[0],
+ st.session_state.archived_timestamp_filter[1],
+ tweets_per_page,
+ st.session_state.offset,
+ matchType,
+ )
+ archived_tweets = response.get()
+
if archived_tweets:
field_options = [
"archived_urlkey",
"archived_tweet_url",
"parsed_tweet_url",
"parsed_archived_tweet_url",
- "parsed_tweet_text_mimetype_json",
"available_tweet_text",
"available_tweet_is_RT",
"available_tweet_info",
exporter = TweetsExporter(parsed_tweets, username, field_options)
df = exporter.dataframe
- # file_path = "claromes_tweets_20240610210338.csv"
- # df = pd.read_csv(file_path)
- # df = df.fillna("")
-
archived_urlkey = df["archived_urlkey"]
archived_timestamp = df["archived_timestamp"]
original_tweet_url = df["original_tweet_url"]
archived_tweet_url = df["archived_tweet_url"]
parsed_tweet_url = df["parsed_tweet_url"]
parsed_archived_tweet_url = df["parsed_archived_tweet_url"]
- parsed_tweet_text_mimetype_json = df["parsed_tweet_text_mimetype_json"]
available_tweet_text = df["available_tweet_text"]
available_tweet_is_RT = df["available_tweet_is_RT"]
available_tweet_info = df["available_tweet_info"]
archived_statuscode = df["archived_statuscode"]
st.divider()
-
st.session_state.current_username = username
return_none_count = 0
for i in range(tweets_per_page):
try:
- if original_tweet_url[i]:
+                        # Reset per iteration so a previous tweet's JSON text is never reused.
+                        parsed_text_json = None
+                        if archived_mimetype[i] == "application/json":
+                            json_parser = JsonParser(parsed_archived_tweet_url[i])
+                            text_json = json_parser.parse()
-                        # Display all tweets
-                        if not not_available:
+                            if text_json:
+                                parsed_text_json = semicolon_parser(text_json)
+
+ # Display all tweets
+ if not not_available:
+ # Display available tweets
+ if available_tweet_text[i]:
st.markdown(
- f'{i+1 + st.session_state.offset}. [**archived url ↗**]({archived_tweet_url[i]}) · [**tweet url ↗**]({original_tweet_url[i]}) · **mimetype:** {archived_mimetype[i]} · **archived timestamp:** {datetime.datetime.strptime(str(archived_timestamp[i]), "%Y%m%d%H%M%S")} · **archived status code:** {archived_statuscode[i]}' # noqa: E501
+ f'[**archived url ↗**]({archived_tweet_url[i]}) · [**tweet url ↗**]({original_tweet_url[i]}) · **mimetype:** {archived_mimetype[i]} · **archived timestamp:** {datetime.datetime.strptime(str(archived_timestamp[i]), "%Y%m%d%H%M%S")} · **archived status code:** {archived_statuscode[i]}' # noqa: E501
)
- # Display available tweets
- if available_tweet_text[i]:
- if available_tweet_is_RT[i]:
- st.info("*Retweet*")
-
- st.write(available_tweet_text[i])
- st.write(f"**{available_tweet_info[i]}**")
+ if available_tweet_is_RT[i]:
+ st.info("*Retweet*")
- st.divider()
+ st.write(available_tweet_text[i])
+ st.write(f"**{available_tweet_info[i]}**")
- # Display tweets not available with text/html, unk, warc/revisit mimetype or application/json mimetype without parsed JSON text # noqa: E501
- elif (
- (
- archived_mimetype[i] != "application/json"
- and not parsed_tweet_text_mimetype_json[i]
+ st.divider()
+ # Display tweets not available with text/html, unk, warc/revisit mimetype or application/json mimetype without parsed JSON text # noqa: E501
+ elif (
+ archived_mimetype[i] != "application/json"
+ and not available_tweet_text[i]
+ ):
+ st.markdown(
+ f'[**archived url ↗**]({archived_tweet_url[i]}) · [**tweet url ↗**]({original_tweet_url[i]}) · **mimetype:** {archived_mimetype[i]} · **archived timestamp:** {datetime.datetime.strptime(str(archived_timestamp[i]), "%Y%m%d%H%M%S")} · **archived status code:** {archived_statuscode[i]}' # noqa: E501
+ )
+ if (
+ ".jpg" in original_tweet_url[i]
+ or ".png" in original_tweet_url[i]
+ ) and (400 <= archived_statuscode[i] <= 511):
+ components.iframe(
+ archived_tweet_url[i],
+ height=500,
+ scrolling=True,
)
- and not available_tweet_text[i]
- ) or (
- (
- archived_mimetype[i] == "application/json"
- and not parsed_tweet_text_mimetype_json[i]
+ elif "/status/" not in original_tweet_url[i]:
+ st.info(
+                                        "This isn't a status, or it is not available"  # noqa: E501
)
- and not available_tweet_text[i]
- ):
- if (
- ".jpg" in original_tweet_url[i]
- or ".png" in original_tweet_url[i]
- ) and (400 <= archived_statuscode[i] <= 511):
- components.iframe(
- archived_tweet_url[i],
- height=500,
- scrolling=True,
- )
- elif "/status/" not in original_tweet_url[i]:
- st.info(
- "This isn't a status or is not available"
- )
- elif (
- check_double_status(
- archived_tweet_url[i], original_tweet_url[i]
- )
- or f"{st.session_state.current_username}"
- not in original_tweet_url[i]
- ):
- st.info(
- f"Replying to {st.session_state.current_username}" # noqa: E501
- )
- else:
- components.iframe(
- archived_tweet_url[i],
- height=500,
- scrolling=True,
- )
-
- st.divider()
-
- # Display tweets not available with application/json mimetype and parsed JSON text # noqa: E501
elif (
- archived_mimetype[i] == "application/json"
- and parsed_tweet_text_mimetype_json[i]
- ) and not available_tweet_text[i]:
- st.code(parsed_tweet_text_mimetype_json[i])
- # st.json(json_data, expanded=False)
+ check_double_status(
+ archived_tweet_url[i],
+ original_tweet_url[i],
+ )
+ or f"{st.session_state.current_username}"
+ not in original_tweet_url[i]
+ ):
+ st.info(
+ f"Replying to {st.session_state.current_username}" # noqa: E501
+ )
+ else:
+ components.iframe(
+ archived_tweet_url[i],
+ height=500,
+ scrolling=True,
+ )
+
+ st.divider()
+ # Display tweets not available with application/json mimetype and parsed JSON text # noqa: E501
+ elif (
+ archived_mimetype[i] == "application/json"
+ and not available_tweet_text[i]
+ ):
+ st.markdown(
+ f'[**archived url ↗**]({archived_tweet_url[i]}) · [**tweet url ↗**]({original_tweet_url[i]}) · **mimetype:** {archived_mimetype[i]} · **archived timestamp:** {datetime.datetime.strptime(str(archived_timestamp[i]), "%Y%m%d%H%M%S")} · **archived status code:** {archived_statuscode[i]}' # noqa: E501
+ )
+ st.code(parsed_text_json)
- st.divider()
+ st.divider()
- # Display only tweets not available
- if not_available:
+ # Display only tweets not available
+ if not_available:
+ # Display tweets not available with text/html, unk, warc/revisit return # noqa: E501
+ if (
+ archived_mimetype[i] != "application/json"
+ and not available_tweet_text[i]
+ ):
return_none_count += 1
st.markdown(
- f'{i+1 + st.session_state.offset}. [**archived url ↗**]({archived_tweet_url[i]}) · [**tweet url ↗**]({original_tweet_url[i]}) · **mimetype:** {archived_mimetype[i]} · **archived timestamp:** {datetime.datetime.strptime(str(archived_timestamp[i]), "%Y%m%d%H%M%S")} · **archived status code:** {archived_statuscode[i]}' # noqa: E501
+ f'[**archived url ↗**]({archived_tweet_url[i]}) · [**tweet url ↗**]({original_tweet_url[i]}) · **mimetype:** {archived_mimetype[i]} · **archived timestamp:** {datetime.datetime.strptime(str(archived_timestamp[i]), "%Y%m%d%H%M%S")} · **archived status code:** {archived_statuscode[i]}' # noqa: E501
)
-
- # Display tweets not available with text/html, unk, warc/revisit return # noqa: E501
if (
- archived_mimetype[i] != "application/json"
- and not available_tweet_text[i]
- ):
- if (
- ".jpg" in original_tweet_url[i]
- or ".png" in original_tweet_url[i]
- ) and (400 <= archived_statuscode[i] <= 511):
- components.iframe(
- archived_tweet_url[i],
- height=500,
- scrolling=True,
- )
- elif "/status/" not in original_tweet_url[i]:
- st.info(
- "This isn't a status or is not available"
- )
- elif (
- check_double_status(
- archived_tweet_url[i], original_tweet_url[i]
- )
- or f"{st.session_state.current_username}"
- not in original_tweet_url[i]
- ):
- st.info(
- f"Replying to {st.session_state.current_username}" # noqa: E501
- )
- else:
- components.iframe(
- archived_tweet_url[i],
- height=500,
- scrolling=True,
- )
-
- st.divider()
-
- # Display tweets not available with application/json return # noqa: E501
+ ".jpg" in original_tweet_url[i]
+ or ".png" in original_tweet_url[i]
+ ) and (400 <= archived_statuscode[i] <= 511):
+ components.iframe(
+ archived_tweet_url[i],
+ height=500,
+ scrolling=True,
+ )
+ elif "/status/" not in original_tweet_url[i]:
+ st.info(
+                                    "This isn't a status, or it is not available"  # noqa: E501
+ )
elif (
- archived_mimetype[i] == "application/json"
- and not available_tweet_text[i]
+ check_double_status(
+ archived_tweet_url[i],
+ original_tweet_url[i],
+ )
+ or f"{st.session_state.current_username}"
+ not in original_tweet_url[i]
):
- st.code(parsed_tweet_text_mimetype_json[i])
- # st.json(json_data, expanded=False)
+ st.info(
+ f"Replying to {st.session_state.current_username}" # noqa: E501
+ )
+ else:
+ components.iframe(
+ archived_tweet_url[i],
+ height=500,
+ scrolling=True,
+ )
- st.divider()
+ st.divider()
- progress.write(
- f"{return_none_count} URLs have been captured in the range {start_index}-{end_index}" # noqa: E501
+ # Display tweets not available with application/json return # noqa: E501
+ elif (
+ archived_mimetype[i] == "application/json"
+ and not available_tweet_text[i]
+ ):
+ return_none_count += 1
+
+ st.markdown(
+ f'[**archived url ↗**]({archived_tweet_url[i]}) · [**tweet url ↗**]({original_tweet_url[i]}) · **mimetype:** {archived_mimetype[i]} · **archived timestamp:** {datetime.datetime.strptime(str(archived_timestamp[i]), "%Y%m%d%H%M%S")} · **archived status code:** {archived_statuscode[i]}' # noqa: E501
)
- pass
+ st.code(parsed_text_json)
+
+ st.divider()
+
+ progress.write(
+ f"{return_none_count} URLs have been captured in the range {start_index}-{end_index}" # noqa: E501
+ )
except IndexError:
if start_index <= 0:
st.session_state.prev_disabled = True
)
if not archived_tweets:
- st.error("Unable to query the Wayback Machine API.")
+ st.error(
+ "Failed to establish a new connection with web.archive.org. Max retries exceeded. Please wait a few minutes and try again." # noqa: E501
+ )
except TypeError as e:
st.error(
f"""
<svg width="400" height="400" viewBox="0 0 400 400" fill="none" xmlns="http://www.w3.org/2000/svg">
-<path d="M66.3377 155.388C65.7708 151.515 66.1155 142.825 71.4265 141.899C79.0689 140.57 79.2956 146.18 84.6566 147.58C96.4535 150.665 174.292 120.705 191.564 116.358C198.101 114.713 198.198 105 210.563 105C216.167 105 212.949 113.528 227.103 117.659C231.821 119.036 321.537 144.755 335.638 153.613C336.375 154.076 342.097 149.711 345.475 149.711C349.585 149.711 348.061 156.318 347.512 158.231" stroke="#ef5552" stroke-opacity="0.9" stroke-width="16" stroke-linecap="round" stroke-linejoin="round"/>
-<path d="M76.0438 157.18C104.715 153.671 133.644 159.144 162.337 159.144C180.702 159.144 198.876 159.796 217.155 160.78C232.592 161.607 248.391 161.432 263.851 161.432C268.61 161.432 339.399 162.499 339.992 163.068C341.344 164.376 339.313 175.156 339.313 177.794C339.313 179.238 340.507 188.39 339.992 188.592C312.456 199.24 148.067 185.975 105.484 185.975C100.317 185.975 74.2748 188.841 70.6327 185.319C69.0401 183.779 71.0896 170.186 71.3068 167.649" stroke="#ef5552" stroke-opacity="0.9" stroke-width="16" stroke-linecap="round" stroke-linejoin="round"/>
-<path d="M50.2399 286.757C48.2835 274.679 58.8072 280.438 63.1705 276.216C65.5956 273.869 62.1377 268.977 67.2563 267.326C86.9115 260.985 201.221 265.128 226.173 267.326C254.045 269.778 307.073 262.416 331.665 271.936C337.53 274.208 346.898 286.066 349.359 294" stroke="#ef5552" stroke-opacity="0.9" stroke-width="16" stroke-linecap="round" stroke-linejoin="round"/>
-<path d="M78.54 192.729C75.2103 217.791 74.8765 243.244 74.8765 268.404" stroke="#ef5552" stroke-opacity="0.9" stroke-width="16" stroke-linecap="round" stroke-linejoin="round"/>
-<path d="M95.6602 189.392C95.5489 213.231 94.1778 238.972 96.1587 261.728" stroke="#ef5552" stroke-opacity="0.9" stroke-width="16" stroke-linecap="round" stroke-linejoin="round"/>
-<path d="M120.11 189.392C117.82 213.426 116.295 237.66 114.546 261.728" stroke="#ef5552" stroke-opacity="0.9" stroke-width="16" stroke-linecap="round" stroke-linejoin="round"/>
-<path d="M133.018 191.617C130.467 214.481 133.489 242.725 131.687 262.84" stroke="#ef5552" stroke-opacity="0.9" stroke-width="16" stroke-linecap="round" stroke-linejoin="round"/>
-<path d="M156.834 190.504C155.165 212.389 152.383 234.105 152.383 256.163" stroke="#ef5552" stroke-opacity="0.9" stroke-width="16" stroke-linecap="round" stroke-linejoin="round"/>
-<path d="M170.257 192.729C170.227 216.032 169.009 239.434 169.009 262.84" stroke="#ef5552" stroke-opacity="0.9" stroke-width="16" stroke-linecap="round" stroke-linejoin="round"/>
-<path d="M127.227 203.675C125.215 203.711 123.631 203.17 121.895 202.927" stroke="#ef5552" stroke-opacity="0.9" stroke-width="16" stroke-linecap="round" stroke-linejoin="round"/>
-<path d="M164.531 202.629C161.934 203.12 160.138 201.951 158.041 201.631" stroke="#ef5552" stroke-opacity="0.9" stroke-width="16" stroke-linecap="round" stroke-linejoin="round"/>
-<path d="M194.64 190.504C194.317 211.918 192.477 233.888 192.477 255.05" stroke="#ef5552" stroke-opacity="0.9" stroke-width="16" stroke-linecap="round" stroke-linejoin="round"/>
-<path d="M211.075 192.729C210.357 215.678 210.574 238.717 210.574 261.727" stroke="#ef5552" stroke-opacity="0.9" stroke-width="16" stroke-linecap="round" stroke-linejoin="round"/>
-<path d="M234.735 198.294C230.696 217.051 233.372 237.419 230.283 256.163" stroke="#ef5552" stroke-opacity="0.9" stroke-width="16" stroke-linecap="round" stroke-linejoin="round"/>
-<path d="M247.257 198.294C247.517 219.421 248.118 240.78 248.922 261.727" stroke="#ef5552" stroke-opacity="0.9" stroke-width="16" stroke-linecap="round" stroke-linejoin="round"/>
-<path d="M276.09 198.294C276.696 219.814 273.428 241.434 273.428 262.84" stroke="#ef5552" stroke-opacity="0.9" stroke-width="16" stroke-linecap="round" stroke-linejoin="round"/>
-<path d="M293.809 199.407C293.414 220.715 294.808 241.656 294.808 262.84" stroke="#ef5552" stroke-opacity="0.9" stroke-width="16" stroke-linecap="round" stroke-linejoin="round"/>
-<path d="M321 199.407C320.7 220.28 317.879 247.076 319.001 265.066" stroke="#ef5552" stroke-opacity="0.9" stroke-width="16" stroke-linecap="round" stroke-linejoin="round"/>
-<path d="M335.889 198.294C335.35 223.471 333.895 248.801 333.895 273.968" stroke="#ef5552" stroke-opacity="0.9" stroke-width="16" stroke-linecap="round" stroke-linejoin="round"/>
-<path d="M196.897 203.278C200.248 202.326 202.739 203.495 205.8 203.695" stroke="#ef5552" stroke-opacity="0.9" stroke-width="16" stroke-linecap="round" stroke-linejoin="round"/>
-<path d="M233.622 204.357C235.117 204.099 236.593 203.716 238.074 203.36" stroke="#ef5552" stroke-opacity="0.9" stroke-width="16" stroke-linecap="round" stroke-linejoin="round"/>
-<path d="M281.475 207.848C282.979 208.035 284.481 207.724 285.926 207.601" stroke="#ef5552" stroke-opacity="0.9" stroke-width="16" stroke-linecap="round" stroke-linejoin="round"/>
-<path d="M81.1602 206.21C82.665 206.459 84.1663 206.045 85.6116 205.88" stroke="#ef5552" stroke-opacity="0.9" stroke-width="16" stroke-linecap="round" stroke-linejoin="round"/>
-<path d="M324.876 210.348C328.342 210.461 331.54 209.814 334.892 209.597" stroke="#ef5552" stroke-opacity="0.9" stroke-width="16" stroke-linecap="round" stroke-linejoin="round"/>
+<path d="M66.3377 155.388C65.7708 151.515 66.1155 142.825 71.4265 141.899C79.0689 140.57 79.2956 146.18 84.6566 147.58C96.4535 150.665 174.292 120.705 191.564 116.358C198.101 114.713 198.198 105 210.563 105C216.167 105 212.949 113.528 227.103 117.659C231.821 119.036 321.537 144.755 335.638 153.613C336.375 154.076 342.097 149.711 345.475 149.711C349.585 149.711 348.061 156.318 347.512 158.231" stroke="black" stroke-opacity="0.9" stroke-width="12" stroke-linecap="round" stroke-linejoin="round"/>
+<path d="M76.0438 157.18C104.715 153.671 133.644 159.144 162.337 159.144C180.702 159.144 198.876 159.796 217.155 160.78C232.592 161.607 248.391 161.432 263.851 161.432C268.61 161.432 339.399 162.499 339.992 163.068C341.344 164.376 339.313 175.156 339.313 177.794C339.313 179.238 340.507 188.39 339.992 188.592C312.456 199.24 148.067 185.975 105.484 185.975C100.317 185.975 74.2748 188.841 70.6327 185.319C69.0401 183.779 71.0896 170.186 71.3068 167.649" stroke="black" stroke-opacity="0.9" stroke-width="12" stroke-linecap="round" stroke-linejoin="round"/>
+<path d="M50.2399 286.757C48.2835 274.679 58.8072 280.438 63.1705 276.216C65.5956 273.869 62.1377 268.977 67.2563 267.326C86.9115 260.985 201.221 265.128 226.173 267.326C254.045 269.778 307.073 262.416 331.665 271.936C337.53 274.208 346.898 286.066 349.359 294" stroke="black" stroke-opacity="0.9" stroke-width="12" stroke-linecap="round" stroke-linejoin="round"/>
+<path d="M78.54 192.729C75.2103 217.791 74.8765 243.244 74.8765 268.404" stroke="black" stroke-opacity="0.9" stroke-width="12" stroke-linecap="round" stroke-linejoin="round"/>
+<path d="M95.6602 189.392C95.5489 213.231 94.1778 238.972 96.1587 261.728" stroke="black" stroke-opacity="0.9" stroke-width="12" stroke-linecap="round" stroke-linejoin="round"/>
+<path d="M120.11 189.392C117.82 213.426 116.295 237.66 114.546 261.728" stroke="black" stroke-opacity="0.9" stroke-width="12" stroke-linecap="round" stroke-linejoin="round"/>
+<path d="M133.018 191.617C130.467 214.481 133.489 242.725 131.687 262.84" stroke="black" stroke-opacity="0.9" stroke-width="12" stroke-linecap="round" stroke-linejoin="round"/>
+<path d="M156.834 190.504C155.165 212.389 152.383 234.105 152.383 256.163" stroke="black" stroke-opacity="0.9" stroke-width="12" stroke-linecap="round" stroke-linejoin="round"/>
+<path d="M170.257 192.729C170.227 216.032 169.009 239.434 169.009 262.84" stroke="black" stroke-opacity="0.9" stroke-width="12" stroke-linecap="round" stroke-linejoin="round"/>
+<path d="M127.227 203.675C125.215 203.711 123.631 203.17 121.895 202.927" stroke="black" stroke-opacity="0.9" stroke-width="12" stroke-linecap="round" stroke-linejoin="round"/>
+<path d="M164.531 202.629C161.934 203.12 160.138 201.951 158.041 201.631" stroke="black" stroke-opacity="0.9" stroke-width="12" stroke-linecap="round" stroke-linejoin="round"/>
+<path d="M194.64 190.504C194.317 211.918 192.477 233.888 192.477 255.05" stroke="black" stroke-opacity="0.9" stroke-width="12" stroke-linecap="round" stroke-linejoin="round"/>
+<path d="M211.075 192.729C210.357 215.678 210.574 238.717 210.574 261.727" stroke="black" stroke-opacity="0.9" stroke-width="12" stroke-linecap="round" stroke-linejoin="round"/>
+<path d="M234.735 198.294C230.696 217.051 233.372 237.419 230.283 256.163" stroke="black" stroke-opacity="0.9" stroke-width="12" stroke-linecap="round" stroke-linejoin="round"/>
+<path d="M247.257 198.294C247.517 219.421 248.118 240.78 248.922 261.727" stroke="black" stroke-opacity="0.9" stroke-width="12" stroke-linecap="round" stroke-linejoin="round"/>
+<path d="M276.09 198.294C276.696 219.814 273.428 241.434 273.428 262.84" stroke="black" stroke-opacity="0.9" stroke-width="12" stroke-linecap="round" stroke-linejoin="round"/>
+<path d="M293.809 199.407C293.414 220.715 294.808 241.656 294.808 262.84" stroke="black" stroke-opacity="0.9" stroke-width="12" stroke-linecap="round" stroke-linejoin="round"/>
+<path d="M321 199.407C320.7 220.28 317.879 247.076 319.001 265.066" stroke="black" stroke-opacity="0.9" stroke-width="12" stroke-linecap="round" stroke-linejoin="round"/>
+<path d="M335.889 198.294C335.35 223.471 333.895 248.801 333.895 273.968" stroke="black" stroke-opacity="0.9" stroke-width="12" stroke-linecap="round" stroke-linejoin="round"/>
+<path d="M196.897 203.278C200.248 202.326 202.739 203.495 205.8 203.695" stroke="black" stroke-opacity="0.9" stroke-width="12" stroke-linecap="round" stroke-linejoin="round"/>
+<path d="M233.622 204.357C235.117 204.099 236.593 203.716 238.074 203.36" stroke="black" stroke-opacity="0.9" stroke-width="12" stroke-linecap="round" stroke-linejoin="round"/>
+<path d="M281.475 207.848C282.979 208.035 284.481 207.724 285.926 207.601" stroke="black" stroke-opacity="0.9" stroke-width="12" stroke-linecap="round" stroke-linejoin="round"/>
+<path d="M81.1602 206.21C82.665 206.459 84.1663 206.045 85.6116 205.88" stroke="black" stroke-opacity="0.9" stroke-width="12" stroke-linecap="round" stroke-linejoin="round"/>
+<path d="M324.876 210.348C328.342 210.461 331.54 209.814 334.892 209.597" stroke="black" stroke-opacity="0.9" stroke-width="12" stroke-linecap="round" stroke-linejoin="round"/>
</svg>
.. autoclass:: TwitterEmbed
:members:
-.. TODO: JSON Issue
-.. .. autoclass:: JsonParser
-.. :members:
+.. autoclass:: JsonParser
+ :members:
Export
.. autofunction:: clean_tweet_url
.. autofunction:: clean_wayback_machine_url
.. autofunction:: delete_tweet_pathnames
+.. autofunction:: is_tweet_url
.. autofunction:: get_response
.. autofunction:: parse_date
.. autofunction:: semicolon_parser
- Ex: Only show unique urls in a prefix query (filtering out captures except first capture of a given url). This is similar to the old prefix query in wayback (note: this query may be slow at the moment):
http://web.archive.org/cdx/search/cdx?url=archive.org&collapse=urlkey&matchType=prefix
+
+
+URL Match Scope
+-----------------
+
+The CDX Server can return results matching a certain prefix, a certain host, or all subdomains by using the ``matchType`` parameter.
+
+For example, with the value ``prefix`` it is possible to retrieve URLs beyond ``/status/``.
+
+The text below is extracted from the official Wayback CDX Server API (Beta) documentation.
+
+.. note::
+
+ For example, if given the url: archive.org/about/ and:
+
+ - ``matchType=exact`` (default if omitted) will return results matching exactly archive.org/about/
+
+ - ``matchType=prefix`` will return results for all results under the path archive.org/about/
+
+ http://web.archive.org/cdx/search/cdx?url=archive.org/about/&matchType=prefix&limit=1000
+
+ - ``matchType=host`` will return results from host archive.org
+
+ http://web.archive.org/cdx/search/cdx?url=archive.org/about/&matchType=host&limit=1000
+
+ - ``matchType=domain`` will return results from host archive.org and all subhosts \*.archive.org
+
+ http://web.archive.org/cdx/search/cdx?url=archive.org/about/&matchType=domain&limit=1000
+
+   The matchType may also be set implicitly by using a wildcard '*' at the end or beginning of the url:
+
+   - If the url ends in '/*', e.g. url=archive.org/*, the query is equivalent to url=archive.org/&matchType=prefix
+   - If the url starts with '*.', e.g. url=*.archive.org/, the query is equivalent to url=archive.org/&matchType=domain
+
+ (Note: The domain mode is only available if the CDX is in `SURT <http://crawler.archive.org/articles/user_manual/glossary.html#surt>`_-order format.)
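+
+As a concrete sketch (calling the CDX endpoint with ``requests`` directly, rather than through the package's ``get_response`` helper; parameter names follow the documentation above), a ``prefix`` query for a user's tweets could look like this:
+
+.. code-block:: python
+
+    import requests
+
+    params = {
+        "url": "https://twitter.com/jack/status/",
+        "matchType": "prefix",  # everything under the /status/ path
+        "output": "json",
+        "limit": 10,
+    }
+    response = requests.get("https://web.archive.org/cdx/search/cdx", params=params)
+    rows = response.json()  # first row is the field header: ["urlkey", "timestamp", ...]
+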
The output message from the package would be: ``Failed to establish a new connection with web.archive.org. Max retries exceeded.``
-.. TODO: JSON Issue
-.. This is the error often returned when performing experimental parsing of URLs with the mimetype ``application/json``.
-.. The warning output message from the package would be: ``Connection error with https://web.archive.org/web/<TIMESTAMP>/https://twitter.com/<USERNAME>/status/<TWEET_ID>. Max retries exceeded. Error parsing the JSON, but the CDX data was saved.``
+This is the error often returned when performing experimental parsing of URLs with the mimetype ``application/json``.
+
+The warning output message from the package would be: ``Connection error with https://web.archive.org/web/<TIMESTAMP>/https://twitter.com/<USERNAME>/status/<TWEET_ID>. Max retries exceeded. Error parsing the JSON, but the CDX data was saved.``
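+
+A minimal sketch of handling this at the caller level (``WaybackTweets`` arguments follow the module examples in these docs; the wait-and-retry step is a suggestion, not package behavior):
+
+.. code-block:: python
+
+    import time
+
+    from waybacktweets.api.request_tweets import WaybackTweets
+
+    api = WaybackTweets("jack", None, None, None, 25, 0, None)
+    archived_tweets = api.get()
+
+    if not archived_tweets:
+        # The package prints the connection error itself and returns no data;
+        # waiting a few minutes before retrying usually clears the rate limit.
+        time.sleep(300)
+        archived_tweets = api.get()
+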
HTTPError
----------------
timestamp_to = parse_date("20191231")
limit = 250
offset = 0
+ matchtype = "exact"
- api = WaybackTweets(username, collapse, timestamp_from, timestamp_to, limit, offset)
+ api = WaybackTweets(username, collapse, timestamp_from, timestamp_to, limit, offset, matchtype)
archived_tweets = api.get()
Filters
----------
+- Filtering by date range: Using the ``from`` and ``to`` filters
-- Calendar: Filtering by date range
+- Only unavailable tweets: Checks if the archived URL still exists on Twitter (see the `flowchart <workflow.html>`_)
-- Checkbox: Only tweets not available
-
-- Checkbox: Only unique URLs (filtering by the collapse option using the urlkey field)
+- Only unique URLs: Filtering by the collapse option using the ``urlkey`` field and the URL Match Scope ``prefix`` (see the sketch after this list)
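+
+A sketch of the equivalent module call (argument order as in the ``WaybackTweets`` examples in these docs; the username and date range are illustrative):
+
+.. code-block:: python
+
+    from waybacktweets.api.request_tweets import WaybackTweets
+
+    # collapse="urlkey" keeps only the first capture of each URL key;
+    # matchtype="prefix" widens the match scope beyond exact URLs.
+    api = WaybackTweets("jack", "urlkey", "20150101", "20191231", 25, 0, "prefix")
+    archived_tweets = api.get()
+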
Pagination
------------
-Pagination is automatic and allows viewing up to 25 tweets per page. This is a fixed value due to the API rate limit.
+Pagination allows viewing up to 25 tweets per page. This fixed page size helps avoid API rate limiting when parsing returns with the mimetype ``application/json``.
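+
+A minimal sketch of how a fixed page size can map to the CDX ``offset`` parameter (the ``page`` variable is hypothetical; the app's internal state handling may differ):
+
+.. code-block:: python
+
+    tweets_per_page = 25  # fixed page size
+    page = 2  # zero-indexed page selected by the user (hypothetical)
+
+    # Number of result rows to skip, passed as the offset argument.
+    offset = page * tweets_per_page
+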
Community Comments
clean_tweet_url,
delete_tweet_pathnames,
get_response,
+ is_tweet_url,
semicolon_parser,
)
# TODO: JSON Issue - Create separate function to handle JSON return without hitting rate limiting # noqa: E501
class JsonParser:
"""
- Class responsible for parsing tweets when the mimetype is application/json.
+ Class responsible for parsing tweets when the mimetype is application/json.\n
+ Note: This class is in an experimental phase, but it is currently being
+ used by the Streamlit Web App.
:param archived_tweet_url: The URL of the archived tweet to be parsed.
"""
encoded_parsed_tweet = semicolon_parser(original_tweet)
encoded_parsed_archived_tweet = semicolon_parser(parsed_wayback_machine_url)
- embed_parser = TwitterEmbed(encoded_tweet)
- content = embed_parser.embed()
+ available_tweet_text = None
+ available_tweet_is_RT = None
+ available_tweet_info = None
- if content:
- self._add_field("available_tweet_text", semicolon_parser(content[0][0]))
- self._add_field("available_tweet_is_RT", content[1][0])
- self._add_field("available_tweet_info", semicolon_parser(content[2][0]))
+ is_tweet = is_tweet_url(encoded_tweet)
+
+ if is_tweet:
+ embed_parser = TwitterEmbed(encoded_tweet)
+ content = embed_parser.embed()
+
+ if content:
+ available_tweet_text = semicolon_parser(content[0][0])
+ available_tweet_is_RT = content[1][0]
+ available_tweet_info = semicolon_parser(content[2][0])
+
+ self._add_field("available_tweet_text", available_tweet_text)
+ self._add_field("available_tweet_is_RT", available_tweet_is_RT)
+ self._add_field("available_tweet_info", available_tweet_info)
# TODO: JSON Issue
# parsed_text_json = ""
:param timestamp_to: The timestamp to stop retrieving tweets at.
:param limit: The maximum number of results to return.
:param offset: The number of lines to skip in the results.
+    :param matchtype: Results matching a certain prefix, a certain host, or all subdomains.
"""
def __init__(
timestamp_to: str,
limit: int,
offset: int,
+ matchtype: str,
):
self.username = username
self.collapse = collapse
self.timestamp_to = timestamp_to
self.limit = limit
self.offset = offset
+ self.matchtype = matchtype
def get(self) -> Optional[Dict[str, Any]]:
"""
:returns: The response from the CDX API in JSON format, if successful.
"""
url = "https://web.archive.org/cdx/search/cdx"
+
+        # Keep the /status/* wildcard unless a broader scope was explicitly
+        # requested; with matchtype unset, the trailing wildcard implicitly
+        # scopes the query to the /status/ path.
+        status = "/status/*"
+        if self.matchtype and self.matchtype != "exact":
+            status = ""
+
params = {
- "url": f"https://twitter.com/{self.username}/status/*",
+ "url": f"https://twitter.com/{self.username}{status}",
"output": "json",
}
if self.offset:
params["offset"] = self.offset
+ if self.matchtype:
+ params["matchType"] = self.matchtype
+
try:
response = get_response(url=url, params=params)
rprint("[red]Connection to web.archive.org timed out.")
except exceptions.ConnectionError:
rprint(
- "[red]Failed to establish a new connection with web.archive.org. Max retries exceeded." # noqa: E501
+ "[red]Failed to establish a new connection with web.archive.org. Max retries exceeded. Please wait a few minutes and try again." # noqa: E501
)
except exceptions.HTTPError:
rprint(
default=None,
help="Filtering by date range up to this date. Format: YYYYmmdd",
)
-@click.option("--limit", type=int, default=None, help="Query result limits.")
+@click.option(
+ "--limit", type=int, metavar="INTEGER", default=None, help="Query result limits."
+)
@click.option(
"--offset",
type=int,
+ metavar="INTEGER",
default=None,
help="Allows for a simple way to scroll through the results.",
)
+@click.option(
+ "--matchtype",
+ type=click.Choice(["exact", "prefix", "host", "domain"], case_sensitive=False),
+ default=None,
+    help="Results matching a certain prefix, a certain host, or all subdomains. Default: exact",  # noqa: E501
+)
def cli(
username: str,
collapse: Optional[str],
timestamp_to: Optional[str],
limit: Optional[int],
offset: Optional[int],
+ matchtype: Optional[str],
) -> None:
"""
Retrieves archived tweets CDX data from the Wayback Machine,
"""
try:
api = WaybackTweets(
- username, collapse, timestamp_from, timestamp_to, limit, offset
+ username, collapse, timestamp_from, timestamp_to, limit, offset, matchtype
)
print("Making a request to the Internet Archive...")
"archived_tweet_url",
"parsed_tweet_url",
"parsed_archived_tweet_url",
- # "parsed_tweet_text_mimetype_json", # TODO: JSON Issue
"available_tweet_text",
"available_tweet_is_RT",
"available_tweet_info",
return date.strftime("%Y%m%d")
except ValueError:
raise click.BadParameter("Date must be in format YYYYmmdd")
+
+
+def is_tweet_url(twitter_url: str) -> bool:
+ """
+ Checks if the provided URL is a Twitter status URL.
+
+ This function checks if the provided URL contains "/status/" exactly once,
+ which is a common pattern in Twitter status URLs.
+
+ :param twitter_url: The URL to check.
+
+ :returns: True if the URL is a Twitter status URL, False otherwise.
+ """
+    return twitter_url.count("/status/") == 1