From a91c4605f30efd17e1b3789d29d2648aa6b030a8 Mon Sep 17 00:00:00 2001 From: Claromes Date: Sat, 15 Jun 2024 02:41:55 -0300 Subject: [PATCH] add matchtype option, review tweet parser, review docs --- .streamlit/config.toml | 13 ++ README.md | 5 +- app/.streamlit/config.toml | 11 - app/app.py | 320 +++++++++++++++------------- app/assets/parthenon.svg | 48 ++--- docs/api.rst | 6 +- docs/cli.rst | 35 +++ docs/errors.rst | 6 +- docs/quickstart.rst | 3 +- docs/streamlit.rst | 9 +- waybacktweets/api/parse_tweets.py | 28 ++- waybacktweets/api/request_tweets.py | 15 +- waybacktweets/cli/main.py | 15 +- waybacktweets/utils/utils.py | 17 ++ 14 files changed, 322 insertions(+), 209 deletions(-) create mode 100644 .streamlit/config.toml delete mode 100644 app/.streamlit/config.toml diff --git a/.streamlit/config.toml b/.streamlit/config.toml new file mode 100644 index 0000000..3b1df1a --- /dev/null +++ b/.streamlit/config.toml @@ -0,0 +1,13 @@ +[theme] +base = "light" +primaryColor = "black" +secondaryBackgroundColor = "gainsboro" +textColor = "black" +backgroundColor = "whitesmoke" +font = "sans serif" + +[client] +toolbarMode = "minimal" + +[server] +port = 8501 diff --git a/README.md b/README.md index 934f755..935553b 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # Wayback Tweets -[![PyPI](https://img.shields.io/pypi/v/waybacktweets)](https://pypi.org/project/waybacktweets) +[![PyPI](https://img.shields.io/pypi/v/waybacktweets)](https://pypi.org/project/waybacktweets) [![Streamlit App](https://static.streamlit.io/badges/streamlit_badge_black_white.svg)](https://waybacktweets.streamlit.app) Retrieves archived tweets CDX data from the Wayback Machine, performs necessary parsing, and saves the data in CSV, JSON, and HTML formats. @@ -32,8 +32,9 @@ timestamp_from = parse_date("20150101") timestamp_to = parse_date("20191231") limit = 250 offset = 0 +matchtype = "exact" -api = WaybackTweets(username, collapse, timestamp_from, timestamp_to, limit, offset) +api = WaybackTweets(username, collapse, timestamp_from, timestamp_to, limit, offset, matchtype) archived_tweets = api.get() ``` diff --git a/app/.streamlit/config.toml b/app/.streamlit/config.toml deleted file mode 100644 index cefb509..0000000 --- a/app/.streamlit/config.toml +++ /dev/null @@ -1,11 +0,0 @@ -[theme] -base = "light" -primaryColor = "#ef5552" -secondaryBackgroundColor = "#efefef" -textColor = "#000000" -backgroundColor = "#f9f9f9" -font = "serif" - -[client] -displayEnabled = true -toolbarMode = "minimal" diff --git a/app/app.py b/app/app.py index 5a8d8c9..ac2683f 100644 --- a/app/app.py +++ b/app/app.py @@ -5,9 +5,13 @@ import streamlit as st import streamlit.components.v1 as components from waybacktweets.api.export_tweets import TweetsExporter -from waybacktweets.api.parse_tweets import TweetsParser +from waybacktweets.api.parse_tweets import JsonParser, TweetsParser from waybacktweets.api.request_tweets import WaybackTweets -from waybacktweets.utils.utils import check_double_status, get_response +from waybacktweets.utils.utils import ( + check_double_status, + get_response, + semicolon_parser, +) # Initial Settings @@ -18,6 +22,17 @@ st.set_page_config( page_icon=LOGO, layout="centered", menu_items={ + "About": f""" + [![GitHub release (latest by date including pre-releases)](https://img.shields.io/github/v/release/claromes/waybacktweets?include_prereleases)](https://github.com/claromes/waybacktweets/releases) 
[![License](https://img.shields.io/github/license/claromes/waybacktweets)](https://github.com/claromes/waybacktweets/blob/main/LICENSE.md) [![Star](https://img.shields.io/github/stars/claromes/waybacktweets?style=social)](https://github.com/claromes/waybacktweets)

        Application that displays multiple archived tweets on the Wayback Machine to avoid opening each link manually.

        The application is a prototype hosted on Streamlit Cloud, allowing users to apply filters and view tweets that lack the original URL. [Read more](https://claromes.github.io/waybacktweets/streamlit.html).

        © Copyright 2023 - {datetime.datetime.now().year}, [Claromes](https://claromes.com) · Icon by The Doodle Library

        ---
+""", # noqa: E501
        "Report a bug": "https://github.com/claromes/waybacktweets/issues",
    },
)

@@ -37,7 +52,7 @@ st.html(
            visibility: hidden;
        }

        img[data-testid="stLogo"] {
-            scale: 3;
+            scale: 4;
            padding-left: 10px;
        }

@@ -137,12 +152,15 @@ def tweets_count(username, archived_timestamp_filter):
st.logo(LOGO)

st.success(
-    """**New Feature: CLI**
+    """**v1.0 🎉: CLI and Python Module**
+
+$ `pip install waybacktweets`
+
+$ `waybacktweets --from 20150101 --to 20191231 --limit 250 jack`

-You can now retrieve archived tweets using the Wayback Tweets command line tool.
-Download the archived tweets CDX data in CSV, JSON, and HTML formats.
+Retrieve archived tweets CDX data in CSV, JSON, and HTML formats using the command line.

-For more details, [read the documentation](https://claromes.github.io/waybacktweets).""" # noqa: E501
+Read the documentation: [claromes.github.io/waybacktweets](https://claromes.github.io/waybacktweets).""" # noqa: E501
)

st.title(
@@ -152,7 +170,10 @@ st.title(
st.caption(
    "[![GitHub release (latest by date including pre-releases)](https://img.shields.io/github/v/release/claromes/waybacktweets?include_prereleases)](https://github.com/claromes/waybacktweets/releases) [![Star](https://img.shields.io/github/stars/claromes/waybacktweets?style=social)](https://github.com/claromes/waybacktweets)" # noqa: E501
)
-st.caption("Display multiple archived tweets on Wayback Machine")
+st.caption("Display multiple archived tweets on the Wayback Machine.")
+st.caption(
+    "Download data via command line with the [`waybacktweets`](https://pypi.org/project/waybacktweets) Python package." # noqa: E501
+)

username = st.text_input("Username", placeholder="Without @")

@@ -165,21 +186,23 @@ st.session_state.archived_timestamp_filter = st.date_input(
    start_date,
    end_date,
    format="YYYY/MM/DD",
-    help="YYYY/MM/DD",
+    help="Using the `from` and `to` filters. Format: YYYY/MM/DD",
)

-not_available = st.checkbox("Only tweets not available")
+not_available = st.checkbox(
+    "Only tweets not available",
+    help="Checks if the archived URL still exists on Twitter",
+)

unique = st.checkbox(
    "Only unique URLs",
-    help="Filtering by the collapse option using the urlkey field",
+    help="Filtering by the collapse option using the `urlkey` field and the URL Match Scope `prefix`", # noqa: E501
)

query = st.button("Query", type="primary", use_container_width=True)


# Tweet Listing Settings

-
if username != st.session_state.current_username:
    st.session_state.current_username = username
    st.session_state.offset = 0

@@ -191,35 +214,38 @@ if query or st.session_state.count:
        username, st.session_state.archived_timestamp_filter
    )

-    st.caption(
-        "The number of tweets per page is set to 25, and this is a fixed value due to the API rate limit."
# noqa: E501 - ) - st.write(f"**{st.session_state.count} URLs have been captured**") + # st.caption( + # "The number of tweets per page is set to 25, and this is a fixed value due to the API rate limit." # noqa: E501 + # ) + # st.write(f"**{st.session_state.count} URLs have been captured**") if st.session_state.count: if tweets_per_page > st.session_state.count: tweets_per_page = st.session_state.count try: - progress = st.empty() - # Tweet Listing Processing + progress = st.empty() + collapse = None + matchType = None if unique: collapse = "urlkey" - - response = WaybackTweets( - username, - collapse, - st.session_state.archived_timestamp_filter[0], - st.session_state.archived_timestamp_filter[1], - tweets_per_page, - st.session_state.offset, - ) - archived_tweets = response.get() + matchType = "prefix" with st.spinner("Parsing..."): + response = WaybackTweets( + username, + collapse, + st.session_state.archived_timestamp_filter[0], + st.session_state.archived_timestamp_filter[1], + tweets_per_page, + st.session_state.offset, + matchType, + ) + archived_tweets = response.get() + if archived_tweets: field_options = [ "archived_urlkey", @@ -228,7 +254,6 @@ if query or st.session_state.count: "archived_tweet_url", "parsed_tweet_url", "parsed_archived_tweet_url", - "parsed_tweet_text_mimetype_json", "available_tweet_text", "available_tweet_is_RT", "available_tweet_info", @@ -242,17 +267,12 @@ if query or st.session_state.count: exporter = TweetsExporter(parsed_tweets, username, field_options) df = exporter.dataframe - # file_path = "claromes_tweets_20240610210338.csv" - # df = pd.read_csv(file_path) - # df = df.fillna("") - archived_urlkey = df["archived_urlkey"] archived_timestamp = df["archived_timestamp"] original_tweet_url = df["original_tweet_url"] archived_tweet_url = df["archived_tweet_url"] parsed_tweet_url = df["parsed_tweet_url"] parsed_archived_tweet_url = df["parsed_archived_tweet_url"] - parsed_tweet_text_mimetype_json = df["parsed_tweet_text_mimetype_json"] available_tweet_text = df["available_tweet_text"] available_tweet_is_RT = df["available_tweet_is_RT"] available_tweet_info = df["available_tweet_info"] @@ -260,7 +280,6 @@ if query or st.session_state.count: archived_statuscode = df["archived_statuscode"] st.divider() - st.session_state.current_username = username return_none_count = 0 @@ -270,139 +289,142 @@ if query or st.session_state.count: for i in range(tweets_per_page): try: - if original_tweet_url[i]: + if archived_mimetype[i] == "application/json": + json_parser = JsonParser(parsed_archived_tweet_url[i]) + text_json = json_parser.parse() - # Display all tweets - if not not_available: + if text_json: + parsed_text_json = semicolon_parser(text_json) + + # Display all tweets + if not not_available: + # Display available tweets + if available_tweet_text[i]: st.markdown( - f'{i+1 + st.session_state.offset}. 
[**archived url ↗**]({archived_tweet_url[i]}) · [**tweet url ↗**]({original_tweet_url[i]}) · **mimetype:** {archived_mimetype[i]} · **archived timestamp:** {datetime.datetime.strptime(str(archived_timestamp[i]), "%Y%m%d%H%M%S")} · **archived status code:** {archived_statuscode[i]}' # noqa: E501 + f'[**archived url ↗**]({archived_tweet_url[i]}) · [**tweet url ↗**]({original_tweet_url[i]}) · **mimetype:** {archived_mimetype[i]} · **archived timestamp:** {datetime.datetime.strptime(str(archived_timestamp[i]), "%Y%m%d%H%M%S")} · **archived status code:** {archived_statuscode[i]}' # noqa: E501 ) - # Display available tweets - if available_tweet_text[i]: - if available_tweet_is_RT[i]: - st.info("*Retweet*") - - st.write(available_tweet_text[i]) - st.write(f"**{available_tweet_info[i]}**") + if available_tweet_is_RT[i]: + st.info("*Retweet*") - st.divider() + st.write(available_tweet_text[i]) + st.write(f"**{available_tweet_info[i]}**") - # Display tweets not available with text/html, unk, warc/revisit mimetype or application/json mimetype without parsed JSON text # noqa: E501 - elif ( - ( - archived_mimetype[i] != "application/json" - and not parsed_tweet_text_mimetype_json[i] + st.divider() + # Display tweets not available with text/html, unk, warc/revisit mimetype or application/json mimetype without parsed JSON text # noqa: E501 + elif ( + archived_mimetype[i] != "application/json" + and not available_tweet_text[i] + ): + st.markdown( + f'[**archived url ↗**]({archived_tweet_url[i]}) · [**tweet url ↗**]({original_tweet_url[i]}) · **mimetype:** {archived_mimetype[i]} · **archived timestamp:** {datetime.datetime.strptime(str(archived_timestamp[i]), "%Y%m%d%H%M%S")} · **archived status code:** {archived_statuscode[i]}' # noqa: E501 + ) + if ( + ".jpg" in original_tweet_url[i] + or ".png" in original_tweet_url[i] + ) and (400 <= archived_statuscode[i] <= 511): + components.iframe( + archived_tweet_url[i], + height=500, + scrolling=True, ) - and not available_tweet_text[i] - ) or ( - ( - archived_mimetype[i] == "application/json" - and not parsed_tweet_text_mimetype_json[i] + elif "/status/" not in original_tweet_url[i]: + st.info( + "This isn't a status or is not available" # noqa: E501 ) - and not available_tweet_text[i] - ): - if ( - ".jpg" in original_tweet_url[i] - or ".png" in original_tweet_url[i] - ) and (400 <= archived_statuscode[i] <= 511): - components.iframe( - archived_tweet_url[i], - height=500, - scrolling=True, - ) - elif "/status/" not in original_tweet_url[i]: - st.info( - "This isn't a status or is not available" - ) - elif ( - check_double_status( - archived_tweet_url[i], original_tweet_url[i] - ) - or f"{st.session_state.current_username}" - not in original_tweet_url[i] - ): - st.info( - f"Replying to {st.session_state.current_username}" # noqa: E501 - ) - else: - components.iframe( - archived_tweet_url[i], - height=500, - scrolling=True, - ) - - st.divider() - - # Display tweets not available with application/json mimetype and parsed JSON text # noqa: E501 elif ( - archived_mimetype[i] == "application/json" - and parsed_tweet_text_mimetype_json[i] - ) and not available_tweet_text[i]: - st.code(parsed_tweet_text_mimetype_json[i]) - # st.json(json_data, expanded=False) + check_double_status( + archived_tweet_url[i], + original_tweet_url[i], + ) + or f"{st.session_state.current_username}" + not in original_tweet_url[i] + ): + st.info( + f"Replying to {st.session_state.current_username}" # noqa: E501 + ) + else: + components.iframe( + archived_tweet_url[i], + height=500, + 
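                                        # Fixed height with scrolling keeps long archived captures browsable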
scrolling=True, + ) + + st.divider() + # Display tweets not available with application/json mimetype and parsed JSON text # noqa: E501 + elif ( + archived_mimetype[i] == "application/json" + and not available_tweet_text[i] + ): + st.markdown( + f'[**archived url ↗**]({archived_tweet_url[i]}) · [**tweet url ↗**]({original_tweet_url[i]}) · **mimetype:** {archived_mimetype[i]} · **archived timestamp:** {datetime.datetime.strptime(str(archived_timestamp[i]), "%Y%m%d%H%M%S")} · **archived status code:** {archived_statuscode[i]}' # noqa: E501 + ) + st.code(parsed_text_json) - st.divider() + st.divider() - # Display only tweets not available - if not_available: + # Display only tweets not available + if not_available: + # Display tweets not available with text/html, unk, warc/revisit return # noqa: E501 + if ( + archived_mimetype[i] != "application/json" + and not available_tweet_text[i] + ): return_none_count += 1 st.markdown( - f'{i+1 + st.session_state.offset}. [**archived url ↗**]({archived_tweet_url[i]}) · [**tweet url ↗**]({original_tweet_url[i]}) · **mimetype:** {archived_mimetype[i]} · **archived timestamp:** {datetime.datetime.strptime(str(archived_timestamp[i]), "%Y%m%d%H%M%S")} · **archived status code:** {archived_statuscode[i]}' # noqa: E501 + f'[**archived url ↗**]({archived_tweet_url[i]}) · [**tweet url ↗**]({original_tweet_url[i]}) · **mimetype:** {archived_mimetype[i]} · **archived timestamp:** {datetime.datetime.strptime(str(archived_timestamp[i]), "%Y%m%d%H%M%S")} · **archived status code:** {archived_statuscode[i]}' # noqa: E501 ) - - # Display tweets not available with text/html, unk, warc/revisit return # noqa: E501 if ( - archived_mimetype[i] != "application/json" - and not available_tweet_text[i] - ): - if ( - ".jpg" in original_tweet_url[i] - or ".png" in original_tweet_url[i] - ) and (400 <= archived_statuscode[i] <= 511): - components.iframe( - archived_tweet_url[i], - height=500, - scrolling=True, - ) - elif "/status/" not in original_tweet_url[i]: - st.info( - "This isn't a status or is not available" - ) - elif ( - check_double_status( - archived_tweet_url[i], original_tweet_url[i] - ) - or f"{st.session_state.current_username}" - not in original_tweet_url[i] - ): - st.info( - f"Replying to {st.session_state.current_username}" # noqa: E501 - ) - else: - components.iframe( - archived_tweet_url[i], - height=500, - scrolling=True, - ) - - st.divider() - - # Display tweets not available with application/json return # noqa: E501 + ".jpg" in original_tweet_url[i] + or ".png" in original_tweet_url[i] + ) and (400 <= archived_statuscode[i] <= 511): + components.iframe( + archived_tweet_url[i], + height=500, + scrolling=True, + ) + elif "/status/" not in original_tweet_url[i]: + st.info( + "This isn't a status or is not available" # noqa: E501 + ) elif ( - archived_mimetype[i] == "application/json" - and not available_tweet_text[i] + check_double_status( + archived_tweet_url[i], + original_tweet_url[i], + ) + or f"{st.session_state.current_username}" + not in original_tweet_url[i] ): - st.code(parsed_tweet_text_mimetype_json[i]) - # st.json(json_data, expanded=False) + st.info( + f"Replying to {st.session_state.current_username}" # noqa: E501 + ) + else: + components.iframe( + archived_tweet_url[i], + height=500, + scrolling=True, + ) - st.divider() + st.divider() - progress.write( - f"{return_none_count} URLs have been captured in the range {start_index}-{end_index}" # noqa: E501 + # Display tweets not available with application/json return # noqa: E501 + elif ( + 
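                            # These captures return raw JSON rather than a renderable page,
                            # so the text recovered by JsonParser is shown via st.code below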
archived_mimetype[i] == "application/json"
                            and not available_tweet_text[i]
                        ):
                            return_none_count += 1

                            st.markdown(
                                f'[**archived url ↗**]({archived_tweet_url[i]}) · [**tweet url ↗**]({original_tweet_url[i]}) · **mimetype:** {archived_mimetype[i]} · **archived timestamp:** {datetime.datetime.strptime(str(archived_timestamp[i]), "%Y%m%d%H%M%S")} · **archived status code:** {archived_statuscode[i]}' # noqa: E501
                            )
                            st.code(parsed_text_json)

                            st.divider()

                    progress.write(
                        f"{return_none_count} URLs have been captured in the range {start_index}-{end_index}" # noqa: E501
                    )
                except IndexError:
                    if start_index <= 0:
                        st.session_state.prev_disabled = True
@@ -431,7 +453,9 @@ if query or st.session_state.count:
                )

            if not archived_tweets:
-                st.error("Unable to query the Wayback Machine API.")
+                st.error(
+                    "Failed to establish a new connection with web.archive.org. Max retries exceeded. Please wait a few minutes and try again." # noqa: E501
+                )
        except TypeError as e:
            st.error(
                f"""
diff --git a/app/assets/parthenon.svg b/app/assets/parthenon.svg
index babc09e..3be2d40 100644
--- a/app/assets/parthenon.svg
+++ b/app/assets/parthenon.svg
@@ -1,26 +1,26 @@
 [SVG markup not recoverable in this copy of the patch: roughly 24 lines of the icon's markup are replaced by this hunk]
diff --git a/docs/api.rst b/docs/api.rst
index ced6bf4..820cbd8 100644
--- a/docs/api.rst
+++ b/docs/api.rst
@@ -22,9 +22,8 @@ Parse
.. autoclass:: TwitterEmbed
    :members:

-.. TODO: JSON Issue
-.. .. autoclass:: JsonParser
-..     :members:
+.. autoclass:: JsonParser
+    :members:


Export
@@ -55,6 +54,7 @@ Utils
.. autofunction:: clean_tweet_url
.. autofunction:: clean_wayback_machine_url
.. autofunction:: delete_tweet_pathnames
.. autofunction:: get_response
+.. autofunction:: is_tweet_url
.. autofunction:: parse_date
.. autofunction:: semicolon_parser
diff --git a/docs/cli.rst b/docs/cli.rst
index 49abef2..f64b299 100644
--- a/docs/cli.rst
+++ b/docs/cli.rst
@@ -40,3 +40,38 @@ However, it is possible to use it with other options. Read below text extracted
- Ex: Only show unique urls in a prefix query (filtering out captures except first capture of a given url). This is similar to the old prefix query in wayback (note: this query may be slow at the moment):

http://web.archive.org/cdx/search/cdx?url=archive.org&collapse=urlkey&matchType=prefix
+
+
+URL Match Scope
+-----------------
+
+The CDX Server can return results matching a certain prefix, a certain host, or all subdomains by using the ``matchType`` param.
+
+For example, with the value ``prefix`` it is possible to retrieve URLs beyond ``/status/``.
+
+The text below is extracted from the official Wayback CDX Server API (Beta) documentation.
+
+.. note::
+
+    For example, if given the url: archive.org/about/ and:
+
+    - ``matchType=exact`` (default if omitted) will return results matching exactly archive.org/about/
+
+    - ``matchType=prefix`` will return results for all results under the path archive.org/about/
+
+      http://web.archive.org/cdx/search/cdx?url=archive.org/about/&matchType=prefix&limit=1000
+
+    - ``matchType=host`` will return results from host archive.org
+
+      http://web.archive.org/cdx/search/cdx?url=archive.org/about/&matchType=host&limit=1000
+
+    - ``matchType=domain`` will return results from host archive.org and all subhosts \*.archive.org
+
+      http://web.archive.org/cdx/search/cdx?url=archive.org/about/&matchType=domain&limit=1000
+
+    The matchType may also be set implicitly by using wildcard '*' at end or beginning of the url:
+
+    - If url ends in '/*', eg url=archive.org/* the query is equivalent to url=archive.org/&matchType=prefix
+    - If url starts with '*.', eg url=*.archive.org/ the query is equivalent to url=archive.org/&matchType=domain
+
+    (Note: The domain mode is only available if the CDX is in `SURT `_-order format.)
diff --git a/docs/errors.rst b/docs/errors.rst
index d12c436..38a8f1b 100644
--- a/docs/errors.rst
+++ b/docs/errors.rst
@@ -17,10 +17,10 @@ This error is raised when the package fails to establish a new connection with w

The output message from the package would be: ``Failed to establish a new connection with web.archive.org. Max retries exceeded.``

-.. TODO: JSON Issue
-.. This is the error often returned when performing experimental parsing of URLs with the mimetype ``application/json``.

-.. The warning output message from the package would be: ``Connection error with https://web.archive.org/web//https://twitter.com//status/. Max retries exceeded. Error parsing the JSON, but the CDX data was saved.``
+This is the error often returned when performing experimental parsing of URLs with the mimetype ``application/json``.
+
+The warning output message from the package would be: ``Connection error with https://web.archive.org/web//https://twitter.com//status/. Max retries exceeded. Error parsing the JSON, but the CDX data was saved.``

HTTPError
----------------
diff --git a/docs/quickstart.rst b/docs/quickstart.rst
index 916996b..8700ee3 100644
--- a/docs/quickstart.rst
+++ b/docs/quickstart.rst
@@ -29,8 +29,9 @@ Using Wayback Tweets as a Python Module
    timestamp_to = parse_date("20191231")
    limit = 250
    offset = 0
+    matchtype = "exact"

-    api = WaybackTweets(username, collapse, timestamp_from, timestamp_to, limit, offset)
+    api = WaybackTweets(username, collapse, timestamp_from, timestamp_to, limit, offset, matchtype)

    archived_tweets = api.get()

diff --git a/docs/streamlit.rst b/docs/streamlit.rst
index 5fc21cd..78da866 100644
--- a/docs/streamlit.rst
+++ b/docs/streamlit.rst
@@ -6,18 +6,17 @@ Aplication that displays multiple archived tweets on Wayback Machine to avoid op

Filters
----------
+- Filtering by date range: Using the ``from`` and ``to`` filters

-- Calendar: Filtering by date range
+- Only unavailable tweets: Checks if the archived URL still exists on Twitter (see the `flowchart `_)

-- Checkbox: Only tweets not available
-
-- Checkbox: Only unique URLs (filtering by the collapse option using the urlkey field)
+- Only unique URLs: Filtering by the collapse option using the ``urlkey`` field and the URL Match Scope ``prefix``


Pagination
------------

-Pagination is automatic and allows viewing up to 25 tweets per page. This is a fixed value due to the API rate limit.
+Pagination allows viewing up to 25 tweets per page. This helps avoid API rate limiting when parsing returns with the mimetype ``application/json``.


Community Comments
diff --git a/waybacktweets/api/parse_tweets.py b/waybacktweets/api/parse_tweets.py
index 3912590..e42f6c5 100644
--- a/waybacktweets/api/parse_tweets.py
+++ b/waybacktweets/api/parse_tweets.py
@@ -14,6 +14,7 @@ from waybacktweets.utils.utils import (
    clean_tweet_url,
    delete_tweet_pathnames,
    get_response,
+    is_tweet_url,
    semicolon_parser,
)

@@ -95,7 +96,9 @@ class TwitterEmbed:
# TODO: JSON Issue - Create separate function to handle JSON return without hitting rate limiting # noqa: E501
class JsonParser:
    """
-    Class responsible for parsing tweets when the mimetype is application/json.
+    Class responsible for parsing tweets when the mimetype is application/json.\n
+    Note: This class is in an experimental phase, but it is currently being
+    used by the Streamlit Web App.

    :param archived_tweet_url: The URL of the archived tweet to be parsed.
    """
@@ -201,13 +204,24 @@ class TweetsParser:
        encoded_parsed_tweet = semicolon_parser(original_tweet)
        encoded_parsed_archived_tweet = semicolon_parser(parsed_wayback_machine_url)

-        embed_parser = TwitterEmbed(encoded_tweet)
-        content = embed_parser.embed()
+        available_tweet_text = None
+        available_tweet_is_RT = None
+        available_tweet_info = None

-        if content:
-            self._add_field("available_tweet_text", semicolon_parser(content[0][0]))
-            self._add_field("available_tweet_is_RT", content[1][0])
-            self._add_field("available_tweet_info", semicolon_parser(content[2][0]))
+        is_tweet = is_tweet_url(encoded_tweet)
+
+        if is_tweet:
+            embed_parser = TwitterEmbed(encoded_tweet)
+            content = embed_parser.embed()
+
+            if content:
+                available_tweet_text = semicolon_parser(content[0][0])
+                available_tweet_is_RT = content[1][0]
+                available_tweet_info = semicolon_parser(content[2][0])
+
+        self._add_field("available_tweet_text", available_tweet_text)
+        self._add_field("available_tweet_is_RT", available_tweet_is_RT)
+        self._add_field("available_tweet_info", available_tweet_info)

# TODO: JSON Issue
# parsed_text_json = ""
diff --git a/waybacktweets/api/request_tweets.py b/waybacktweets/api/request_tweets.py
index 07e9903..83f0405 100644
--- a/waybacktweets/api/request_tweets.py
+++ b/waybacktweets/api/request_tweets.py
@@ -16,6 +16,7 @@ class WaybackTweets:
    :param timestamp_to: The timestamp to stop retrieving tweets at.
    :param limit: The maximum number of results to return.
    :param offset: The number of lines to skip in the results.
+    :param matchtype: Return results matching a certain prefix, a certain host, or all subdomains. # noqa: E501
    """

    def __init__(
@@ -26,6 +27,7 @@ class WaybackTweets:
        timestamp_to: str,
        limit: int,
        offset: int,
+        matchtype: str,
    ):
        self.username = username
        self.collapse = collapse
@@ -33,6 +35,7 @@ class WaybackTweets:
        self.timestamp_to = timestamp_to
        self.limit = limit
        self.offset = offset
+        self.matchtype = matchtype

    def get(self) -> Optional[Dict[str, Any]]:
        """
@@ -42,8 +45,13 @@

        :returns: The response from the CDX API in JSON format, if successful.
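
        A minimal usage sketch (illustrative values, assuming the argument
        order added by this patch, with ``matchtype`` as the final argument):

        .. code-block:: python

            api = WaybackTweets("jack", None, "20150101", "20191231", 250, 0, "exact")
            archived_tweets = api.get()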
""" url = "https://web.archive.org/cdx/search/cdx" + + status = "/status/*" + if self.matchtype != "exact": + status = "" + params = { - "url": f"https://twitter.com/{self.username}/status/*", + "url": f"https://twitter.com/{self.username}{status}", "output": "json", } @@ -62,6 +70,9 @@ class WaybackTweets: if self.offset: params["offset"] = self.offset + if self.matchtype: + params["matchType"] = self.matchtype + try: response = get_response(url=url, params=params) @@ -71,7 +82,7 @@ class WaybackTweets: rprint("[red]Connection to web.archive.org timed out.") except exceptions.ConnectionError: rprint( - "[red]Failed to establish a new connection with web.archive.org. Max retries exceeded." # noqa: E501 + "[red]Failed to establish a new connection with web.archive.org. Max retries exceeded. Please wait a few minutes and try again." # noqa: E501 ) except exceptions.HTTPError: rprint( diff --git a/waybacktweets/cli/main.py b/waybacktweets/cli/main.py index 96509c5..987bac2 100644 --- a/waybacktweets/cli/main.py +++ b/waybacktweets/cli/main.py @@ -40,13 +40,22 @@ from waybacktweets.utils.utils import parse_date default=None, help="Filtering by date range up to this date. Format: YYYYmmdd", ) -@click.option("--limit", type=int, default=None, help="Query result limits.") +@click.option( + "--limit", type=int, metavar="INTEGER", default=None, help="Query result limits." +) @click.option( "--offset", type=int, + metavar="INTEGER", default=None, help="Allows for a simple way to scroll through the results.", ) +@click.option( + "--matchtype", + type=click.Choice(["exact", "prefix", "host", "domain"], case_sensitive=False), + default=None, + help="Results matching a certain prefix, a certain host or all subdomains. Default: exact", # noqa: E501 +) def cli( username: str, collapse: Optional[str], @@ -54,6 +63,7 @@ def cli( timestamp_to: Optional[str], limit: Optional[int], offset: Optional[int], + matchtype: Optional[str], ) -> None: """ Retrieves archived tweets CDX data from the Wayback Machine, @@ -63,7 +73,7 @@ def cli( """ try: api = WaybackTweets( - username, collapse, timestamp_from, timestamp_to, limit, offset + username, collapse, timestamp_from, timestamp_to, limit, offset, matchtype ) print("Making a request to the Internet Archive...") @@ -77,7 +87,6 @@ def cli( "archived_tweet_url", "parsed_tweet_url", "parsed_archived_tweet_url", - # "parsed_tweet_text_mimetype_json", # TODO: JSON Issue "available_tweet_text", "available_tweet_is_RT", "available_tweet_info", diff --git a/waybacktweets/utils/utils.py b/waybacktweets/utils/utils.py index 50ea728..5163b33 100644 --- a/waybacktweets/utils/utils.py +++ b/waybacktweets/utils/utils.py @@ -179,3 +179,20 @@ def parse_date( return date.strftime("%Y%m%d") except ValueError: raise click.BadParameter("Date must be in format YYYYmmdd") + + +def is_tweet_url(twitter_url: str) -> bool: + """ + Checks if the provided URL is a Twitter status URL. + + This function checks if the provided URL contains "/status/" exactly once, + which is a common pattern in Twitter status URLs. + + :param twitter_url: The URL to check. + + :returns: True if the URL is a Twitter status URL, False otherwise. + """ + if twitter_url.count("/status/") == 1: + return True + + return False -- 2.34.1