add matchtype option, review tweet parser, review docs
author    Claromes <claromes@hey.com>
Sat, 15 Jun 2024 05:41:55 +0000 (02:41 -0300)
committer    Claromes <claromes@hey.com>
Sat, 15 Jun 2024 05:41:55 +0000 (02:41 -0300)
14 files changed:
.streamlit/config.toml [new file with mode: 0644]
README.md
app/.streamlit/config.toml [deleted file]
app/app.py
app/assets/parthenon.svg
docs/api.rst
docs/cli.rst
docs/errors.rst
docs/quickstart.rst
docs/streamlit.rst
waybacktweets/api/parse_tweets.py
waybacktweets/api/request_tweets.py
waybacktweets/cli/main.py
waybacktweets/utils/utils.py

diff --git a/.streamlit/config.toml b/.streamlit/config.toml
new file mode 100644 (file)
index 0000000..3b1df1a
--- /dev/null
+++ b/.streamlit/config.toml
@@ -0,0 +1,13 @@
+[theme]
+base = "light"
+primaryColor = "black"
+secondaryBackgroundColor = "gainsboro"
+textColor = "black"
+backgroundColor = "whitesmoke"
+font = "sans serif"
+
+[client]
+toolbarMode = "minimal"
+
+[server]
+port = 8501
diff --git a/README.md b/README.md
index 934f755135f6ec8551293b93220f2355126bb730..935553b3f67d42d61dccc6966c674ddd48047b4f 100644 (file)
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # Wayback Tweets
 
-[![PyPI](https://img.shields.io/pypi/v/waybacktweets)](https://pypi.org/project/waybacktweets)
+[![PyPI](https://img.shields.io/pypi/v/waybacktweets)](https://pypi.org/project/waybacktweets) [![Streamlit App](https://static.streamlit.io/badges/streamlit_badge_black_white.svg)](https://waybacktweets.streamlit.app)
 
 Retrieves archived tweets CDX data from the Wayback Machine, performs necessary parsing, and saves the data in CSV, JSON, and HTML formats.
 
@@ -32,8 +32,9 @@ timestamp_from = parse_date("20150101")
 timestamp_to = parse_date("20191231")
 limit = 250
 offset = 0
+matchtype = "exact"
 
-api = WaybackTweets(username, collapse, timestamp_from, timestamp_to, limit, offset)
+api = WaybackTweets(username, collapse, timestamp_from, timestamp_to, limit, offset, matchtype)
 
 archived_tweets = api.get()
 ```
diff --git a/app/.streamlit/config.toml b/app/.streamlit/config.toml
deleted file mode 100644 (file)
index cefb509..0000000
--- a/app/.streamlit/config.toml
+++ /dev/null
@@ -1,11 +0,0 @@
-[theme]
-base = "light"
-primaryColor = "#ef5552"
-secondaryBackgroundColor = "#efefef"
-textColor = "#000000"
-backgroundColor = "#f9f9f9"
-font = "serif"
-
-[client]
-displayEnabled = true
-toolbarMode = "minimal"
diff --git a/app/app.py b/app/app.py
index 5a8d8c9d13d6b169c41f4303f97ac215eaf6c21b..ac2683f8d8baa99fa7ab8f2f114a79521e5749aa 100644 (file)
--- a/app/app.py
+++ b/app/app.py
@@ -5,9 +5,13 @@ import streamlit as st
 import streamlit.components.v1 as components
 
 from waybacktweets.api.export_tweets import TweetsExporter
-from waybacktweets.api.parse_tweets import TweetsParser
+from waybacktweets.api.parse_tweets import JsonParser, TweetsParser
 from waybacktweets.api.request_tweets import WaybackTweets
-from waybacktweets.utils.utils import check_double_status, get_response
+from waybacktweets.utils.utils import (
+    check_double_status,
+    get_response,
+    semicolon_parser,
+)
 
 # Initial Settings
 
@@ -18,6 +22,17 @@ st.set_page_config(
     page_icon=LOGO,
     layout="centered",
     menu_items={
+        "About": f"""
+    [![GitHub release (latest by date including pre-releases)](https://img.shields.io/github/v/release/claromes/waybacktweets?include_prereleases)](https://github.com/claromes/waybacktweets/releases) [![License](https://img.shields.io/github/license/claromes/waybacktweets)](https://github.com/claromes/waybacktweets/blob/main/LICENSE.md) [![Star](https://img.shields.io/github/stars/claromes/waybacktweets?style=social)](https://github.com/claromes/waybacktweets)
+
+    Application that displays multiple archived tweets on the Wayback Machine to avoid opening each link manually.
+
+    The application is a prototype hosted on Streamlit Cloud, allowing users to apply filters and view tweets that lack the original URL. [Read more](https://claromes.github.io/waybacktweets/streamlit.html).
+
+    © Copyright 2023 - {datetime.datetime.now().year}, [Claromes](https://claromes.com) · Icon by The Doodle Library
+
+    ---
+""",  # noqa: E501
         "Report a bug": "https://github.com/claromes/waybacktweets/issues",
     },
 )
@@ -37,7 +52,7 @@ st.html(
         visibility: hidden;
     }
     img[data-testid="stLogo"] {
-        scale: 3;
+        scale: 4;
         padding-left: 10px;
     }
 </style>
@@ -137,12 +152,15 @@ def tweets_count(username, archived_timestamp_filter):
 st.logo(LOGO)
 
 st.success(
-    """**New Feature: CLI**
+    """**v1.0 🎉: CLI and Python Module**
+
+$ `pip install waybacktweets`
+
+$ `waybacktweets --from 20150101 --to 20191231 --limit 250 jack`
 
-You can now retrieve archived tweets using the Wayback Tweets command line tool.
-Download the archived tweets CDX data in CSV, JSON, and HTML formats.
+Retrieve archived tweets CDX data in CSV, JSON, and HTML formats using the command line.
 
-For more details, [read the documentation](https://claromes.github.io/waybacktweets)."""  # noqa: E501
+Read the documentation: [claromes.github.io/waybacktweets](https://claromes.github.io/waybacktweets)."""  # noqa: E501
 )
 
 st.title(
@@ -152,7 +170,10 @@ st.title(
 st.caption(
     "[![GitHub release (latest by date including pre-releases)](https://img.shields.io/github/v/release/claromes/waybacktweets?include_prereleases)](https://github.com/claromes/waybacktweets/releases) [![Star](https://img.shields.io/github/stars/claromes/waybacktweets?style=social)](https://github.com/claromes/waybacktweets)"  # noqa: E501
 )
-st.caption("Display multiple archived tweets on Wayback Machine")
+st.caption("Display multiple archived tweets on Wayback Machine.")
+st.caption(
+    "Download data via command line with the [`waybacktweets`](https://pypi.org/project/waybacktweets) Python package."  # noqa: E501
+)
 
 username = st.text_input("Username", placeholder="Without @")
 
@@ -165,21 +186,23 @@ st.session_state.archived_timestamp_filter = st.date_input(
     start_date,
     end_date,
     format="YYYY/MM/DD",
-    help="YYYY/MM/DD",
+    help="Using the `from` and `to` filters. Format: YYYY/MM/DD",
 )
 
-not_available = st.checkbox("Only tweets not available")
+not_available = st.checkbox(
+    "Only tweets not available",
+    help="Checks if the archived URL still exists on Twitter",
+)
 
 unique = st.checkbox(
     "Only unique URLs",
-    help="Filtering by the collapse option using the urlkey field",
+    help="Filtering by the collapse option using the `urlkey` field and the URL Match Scope `prefix`",  # noqa: E501
 )
 
 query = st.button("Query", type="primary", use_container_width=True)
 
 # Tweet Listing Settings
 
-
 if username != st.session_state.current_username:
     st.session_state.current_username = username
     st.session_state.offset = 0
@@ -191,35 +214,38 @@ if query or st.session_state.count:
         username, st.session_state.archived_timestamp_filter
     )
 
-    st.caption(
-        "The number of tweets per page is set to 25, and this is a fixed value due to the API rate limit."  # noqa: E501
-    )
-    st.write(f"**{st.session_state.count} URLs have been captured**")
+    st.caption(
+        "The number of tweets per page is set to 25, and this is a fixed value due to the API rate limit."  # noqa: E501
+    )
+    st.write(f"**{st.session_state.count} URLs have been captured**")
 
     if st.session_state.count:
         if tweets_per_page > st.session_state.count:
             tweets_per_page = st.session_state.count
 
     try:
-        progress = st.empty()
-
         # Tweet Listing Processing
 
+        progress = st.empty()
+
         collapse = None
+        matchType = None
         if unique:
             collapse = "urlkey"
-
-        response = WaybackTweets(
-            username,
-            collapse,
-            st.session_state.archived_timestamp_filter[0],
-            st.session_state.archived_timestamp_filter[1],
-            tweets_per_page,
-            st.session_state.offset,
-        )
-        archived_tweets = response.get()
+            matchType = "prefix"
 
         with st.spinner("Parsing..."):
+            response = WaybackTweets(
+                username,
+                collapse,
+                st.session_state.archived_timestamp_filter[0],
+                st.session_state.archived_timestamp_filter[1],
+                tweets_per_page,
+                st.session_state.offset,
+                matchType,
+            )
+            archived_tweets = response.get()
+
             if archived_tweets:
                 field_options = [
                     "archived_urlkey",
@@ -228,7 +254,6 @@ if query or st.session_state.count:
                     "archived_tweet_url",
                     "parsed_tweet_url",
                     "parsed_archived_tweet_url",
-                    "parsed_tweet_text_mimetype_json",
                     "available_tweet_text",
                     "available_tweet_is_RT",
                     "available_tweet_info",
@@ -242,17 +267,12 @@ if query or st.session_state.count:
                 exporter = TweetsExporter(parsed_tweets, username, field_options)
                 df = exporter.dataframe
 
-                # file_path = "claromes_tweets_20240610210338.csv"
-                # df = pd.read_csv(file_path)
-                # df = df.fillna("")
-
                 archived_urlkey = df["archived_urlkey"]
                 archived_timestamp = df["archived_timestamp"]
                 original_tweet_url = df["original_tweet_url"]
                 archived_tweet_url = df["archived_tweet_url"]
                 parsed_tweet_url = df["parsed_tweet_url"]
                 parsed_archived_tweet_url = df["parsed_archived_tweet_url"]
-                parsed_tweet_text_mimetype_json = df["parsed_tweet_text_mimetype_json"]
                 available_tweet_text = df["available_tweet_text"]
                 available_tweet_is_RT = df["available_tweet_is_RT"]
                 available_tweet_info = df["available_tweet_info"]
@@ -260,7 +280,6 @@ if query or st.session_state.count:
                 archived_statuscode = df["archived_statuscode"]
 
                 st.divider()
-
                 st.session_state.current_username = username
 
                 return_none_count = 0
@@ -270,139 +289,142 @@ if query or st.session_state.count:
 
                 for i in range(tweets_per_page):
                     try:
-                        if original_tweet_url[i]:
+                        if archived_mimetype[i] == "application/json":
+                            json_parser = JsonParser(parsed_archived_tweet_url[i])
+                            text_json = json_parser.parse()
 
-                            # Display all tweets
-                            if not not_available:
+                            if text_json:
+                                parsed_text_json = semicolon_parser(text_json)
+
+                        # Display all tweets
+                        if not not_available:
+                            # Display available tweets
+                            if available_tweet_text[i]:
                                 st.markdown(
-                                    f'{i+1 + st.session_state.offset}. [**archived url ↗**]({archived_tweet_url[i]}) · [**tweet url ↗**]({original_tweet_url[i]}) · **mimetype:** {archived_mimetype[i]} · **archived timestamp:** {datetime.datetime.strptime(str(archived_timestamp[i]), "%Y%m%d%H%M%S")} · **archived status code:** {archived_statuscode[i]}'  # noqa: E501
+                                    f'[**archived url ↗**]({archived_tweet_url[i]}) · [**tweet url ↗**]({original_tweet_url[i]}) · **mimetype:** {archived_mimetype[i]} · **archived timestamp:** {datetime.datetime.strptime(str(archived_timestamp[i]), "%Y%m%d%H%M%S")} · **archived status code:** {archived_statuscode[i]}'  # noqa: E501
                                 )
 
-                                # Display available tweets
-                                if available_tweet_text[i]:
-                                    if available_tweet_is_RT[i]:
-                                        st.info("*Retweet*")
-
-                                    st.write(available_tweet_text[i])
-                                    st.write(f"**{available_tweet_info[i]}**")
+                                if available_tweet_is_RT[i]:
+                                    st.info("*Retweet*")
 
-                                    st.divider()
+                                st.write(available_tweet_text[i])
+                                st.write(f"**{available_tweet_info[i]}**")
 
-                                # Display tweets not available with text/html, unk, warc/revisit mimetype or application/json mimetype without parsed JSON text # noqa: E501
-                                elif (
-                                    (
-                                        archived_mimetype[i] != "application/json"
-                                        and not parsed_tweet_text_mimetype_json[i]
+                                st.divider()
+                            # Display tweets not available with text/html, unk, warc/revisit mimetype or application/json mimetype without parsed JSON text # noqa: E501
+                            elif (
+                                archived_mimetype[i] != "application/json"
+                                and not available_tweet_text[i]
+                            ):
+                                st.markdown(
+                                    f'[**archived url ↗**]({archived_tweet_url[i]}) · [**tweet url ↗**]({original_tweet_url[i]}) · **mimetype:** {archived_mimetype[i]} · **archived timestamp:** {datetime.datetime.strptime(str(archived_timestamp[i]), "%Y%m%d%H%M%S")} · **archived status code:** {archived_statuscode[i]}'  # noqa: E501
+                                )
+                                if (
+                                    ".jpg" in original_tweet_url[i]
+                                    or ".png" in original_tweet_url[i]
+                                ) and (400 <= archived_statuscode[i] <= 511):
+                                    components.iframe(
+                                        archived_tweet_url[i],
+                                        height=500,
+                                        scrolling=True,
                                     )
-                                    and not available_tweet_text[i]
-                                ) or (
-                                    (
-                                        archived_mimetype[i] == "application/json"
-                                        and not parsed_tweet_text_mimetype_json[i]
+                                elif "/status/" not in original_tweet_url[i]:
+                                    st.info(
+                                        "This isn't a status or is not available"  # noqa: E501
                                     )
-                                    and not available_tweet_text[i]
-                                ):
-                                    if (
-                                        ".jpg" in original_tweet_url[i]
-                                        or ".png" in original_tweet_url[i]
-                                    ) and (400 <= archived_statuscode[i] <= 511):
-                                        components.iframe(
-                                            archived_tweet_url[i],
-                                            height=500,
-                                            scrolling=True,
-                                        )
-                                    elif "/status/" not in original_tweet_url[i]:
-                                        st.info(
-                                            "This isn't a status or is not available"
-                                        )
-                                    elif (
-                                        check_double_status(
-                                            archived_tweet_url[i], original_tweet_url[i]
-                                        )
-                                        or f"{st.session_state.current_username}"
-                                        not in original_tweet_url[i]
-                                    ):
-                                        st.info(
-                                            f"Replying to {st.session_state.current_username}"  # noqa: E501
-                                        )
-                                    else:
-                                        components.iframe(
-                                            archived_tweet_url[i],
-                                            height=500,
-                                            scrolling=True,
-                                        )
-
-                                    st.divider()
-
-                                # Display tweets not available with application/json mimetype and parsed JSON text # noqa: E501
                                 elif (
-                                    archived_mimetype[i] == "application/json"
-                                    and parsed_tweet_text_mimetype_json[i]
-                                ) and not available_tweet_text[i]:
-                                    st.code(parsed_tweet_text_mimetype_json[i])
-                                    # st.json(json_data, expanded=False)
+                                    check_double_status(
+                                        archived_tweet_url[i],
+                                        original_tweet_url[i],
+                                    )
+                                    or f"{st.session_state.current_username}"
+                                    not in original_tweet_url[i]
+                                ):
+                                    st.info(
+                                        f"Replying to {st.session_state.current_username}"  # noqa: E501
+                                    )
+                                else:
+                                    components.iframe(
+                                        archived_tweet_url[i],
+                                        height=500,
+                                        scrolling=True,
+                                    )
+
+                                st.divider()
+                            # Display tweets not available with application/json mimetype and parsed JSON text # noqa: E501
+                            elif (
+                                archived_mimetype[i] == "application/json"
+                                and not available_tweet_text[i]
+                            ):
+                                st.markdown(
+                                    f'[**archived url ↗**]({archived_tweet_url[i]}) · [**tweet url ↗**]({original_tweet_url[i]}) · **mimetype:** {archived_mimetype[i]} · **archived timestamp:** {datetime.datetime.strptime(str(archived_timestamp[i]), "%Y%m%d%H%M%S")} · **archived status code:** {archived_statuscode[i]}'  # noqa: E501
+                                )
+                                st.code(parsed_text_json)
 
-                                    st.divider()
+                                st.divider()
 
-                            # Display only tweets not available
-                            if not_available:
+                        # Display only tweets not available
+                        if not_available:
+                            # Display tweets not available with text/html, unk, warc/revisit return # noqa: E501
+                            if (
+                                archived_mimetype[i] != "application/json"
+                                and not available_tweet_text[i]
+                            ):
                                 return_none_count += 1
 
                                 st.markdown(
-                                    f'{i+1 + st.session_state.offset}. [**archived url ↗**]({archived_tweet_url[i]}) · [**tweet url ↗**]({original_tweet_url[i]}) · **mimetype:** {archived_mimetype[i]} · **archived timestamp:** {datetime.datetime.strptime(str(archived_timestamp[i]), "%Y%m%d%H%M%S")} · **archived status code:** {archived_statuscode[i]}'  # noqa: E501
+                                    f'[**archived url ↗**]({archived_tweet_url[i]}) · [**tweet url ↗**]({original_tweet_url[i]}) · **mimetype:** {archived_mimetype[i]} · **archived timestamp:** {datetime.datetime.strptime(str(archived_timestamp[i]), "%Y%m%d%H%M%S")} · **archived status code:** {archived_statuscode[i]}'  # noqa: E501
                                 )
-
-                                # Display tweets not available with text/html, unk, warc/revisit return # noqa: E501
                                 if (
-                                    archived_mimetype[i] != "application/json"
-                                    and not available_tweet_text[i]
-                                ):
-                                    if (
-                                        ".jpg" in original_tweet_url[i]
-                                        or ".png" in original_tweet_url[i]
-                                    ) and (400 <= archived_statuscode[i] <= 511):
-                                        components.iframe(
-                                            archived_tweet_url[i],
-                                            height=500,
-                                            scrolling=True,
-                                        )
-                                    elif "/status/" not in original_tweet_url[i]:
-                                        st.info(
-                                            "This isn't a status or is not available"
-                                        )
-                                    elif (
-                                        check_double_status(
-                                            archived_tweet_url[i], original_tweet_url[i]
-                                        )
-                                        or f"{st.session_state.current_username}"
-                                        not in original_tweet_url[i]
-                                    ):
-                                        st.info(
-                                            f"Replying to {st.session_state.current_username}"  # noqa: E501
-                                        )
-                                    else:
-                                        components.iframe(
-                                            archived_tweet_url[i],
-                                            height=500,
-                                            scrolling=True,
-                                        )
-
-                                    st.divider()
-
-                                # Display tweets not available with application/json return # noqa: E501
+                                    ".jpg" in original_tweet_url[i]
+                                    or ".png" in original_tweet_url[i]
+                                ) and (400 <= archived_statuscode[i] <= 511):
+                                    components.iframe(
+                                        archived_tweet_url[i],
+                                        height=500,
+                                        scrolling=True,
+                                    )
+                                elif "/status/" not in original_tweet_url[i]:
+                                    st.info(
+                                        "This isn't a status or is not available"  # noqa: E501
+                                    )
                                 elif (
-                                    archived_mimetype[i] == "application/json"
-                                    and not available_tweet_text[i]
+                                    check_double_status(
+                                        archived_tweet_url[i],
+                                        original_tweet_url[i],
+                                    )
+                                    or f"{st.session_state.current_username}"
+                                    not in original_tweet_url[i]
                                 ):
-                                    st.code(parsed_tweet_text_mimetype_json[i])
-                                    # st.json(json_data, expanded=False)
+                                    st.info(
+                                        f"Replying to {st.session_state.current_username}"  # noqa: E501
+                                    )
+                                else:
+                                    components.iframe(
+                                        archived_tweet_url[i],
+                                        height=500,
+                                        scrolling=True,
+                                    )
 
-                                    st.divider()
+                                st.divider()
 
-                                progress.write(
-                                    f"{return_none_count} URLs have been captured in the range {start_index}-{end_index}"  # noqa: E501
+                            # Display tweets not available with application/json return # noqa: E501
+                            elif (
+                                archived_mimetype[i] == "application/json"
+                                and not available_tweet_text[i]
+                            ):
+                                return_none_count += 1
+
+                                st.markdown(
+                                    f'[**archived url ↗**]({archived_tweet_url[i]}) · [**tweet url ↗**]({original_tweet_url[i]}) · **mimetype:** {archived_mimetype[i]} · **archived timestamp:** {datetime.datetime.strptime(str(archived_timestamp[i]), "%Y%m%d%H%M%S")} · **archived status code:** {archived_statuscode[i]}'  # noqa: E501
                                 )
-                        pass
+                                st.code(parsed_text_json)
+
+                                st.divider()
+
+                            progress.write(
+                                f"{return_none_count} URLs have been captured in the range {start_index}-{end_index}"  # noqa: E501
+                            )
                     except IndexError:
                         if start_index <= 0:
                             st.session_state.prev_disabled = True
@@ -431,7 +453,9 @@ if query or st.session_state.count:
             )
 
         if not archived_tweets:
-            st.error("Unable to query the Wayback Machine API.")
+            st.error(
+                "Failed to establish a new connection with web.archive.org. Max retries exceeded. Please wait a few minutes and try again."  # noqa: E501
+            )
     except TypeError as e:
         st.error(
             f"""
diff --git a/app/assets/parthenon.svg b/app/assets/parthenon.svg
index babc09e23509b2c735e5958d9f017cf186142cca..3be2d40d29ab681a2021b878a57db7d7e4353432 100644 (file)
--- a/app/assets/parthenon.svg
+++ b/app/assets/parthenon.svg
@@ -1,26 +1,26 @@
 <svg width="400" height="400" viewBox="0 0 400 400" fill="none" xmlns="http://www.w3.org/2000/svg">
-<path d="M66.3377 155.388C65.7708 151.515 66.1155 142.825 71.4265 141.899C79.0689 140.57 79.2956 146.18 84.6566 147.58C96.4535 150.665 174.292 120.705 191.564 116.358C198.101 114.713 198.198 105 210.563 105C216.167 105 212.949 113.528 227.103 117.659C231.821 119.036 321.537 144.755 335.638 153.613C336.375 154.076 342.097 149.711 345.475 149.711C349.585 149.711 348.061 156.318 347.512 158.231" stroke="#ef5552" stroke-opacity="0.9" stroke-width="16" stroke-linecap="round" stroke-linejoin="round"/>
-<path d="M76.0438 157.18C104.715 153.671 133.644 159.144 162.337 159.144C180.702 159.144 198.876 159.796 217.155 160.78C232.592 161.607 248.391 161.432 263.851 161.432C268.61 161.432 339.399 162.499 339.992 163.068C341.344 164.376 339.313 175.156 339.313 177.794C339.313 179.238 340.507 188.39 339.992 188.592C312.456 199.24 148.067 185.975 105.484 185.975C100.317 185.975 74.2748 188.841 70.6327 185.319C69.0401 183.779 71.0896 170.186 71.3068 167.649" stroke="#ef5552" stroke-opacity="0.9" stroke-width="16" stroke-linecap="round" stroke-linejoin="round"/>
-<path d="M50.2399 286.757C48.2835 274.679 58.8072 280.438 63.1705 276.216C65.5956 273.869 62.1377 268.977 67.2563 267.326C86.9115 260.985 201.221 265.128 226.173 267.326C254.045 269.778 307.073 262.416 331.665 271.936C337.53 274.208 346.898 286.066 349.359 294" stroke="#ef5552" stroke-opacity="0.9" stroke-width="16" stroke-linecap="round" stroke-linejoin="round"/>
-<path d="M78.54 192.729C75.2103 217.791 74.8765 243.244 74.8765 268.404" stroke="#ef5552" stroke-opacity="0.9" stroke-width="16" stroke-linecap="round" stroke-linejoin="round"/>
-<path d="M95.6602 189.392C95.5489 213.231 94.1778 238.972 96.1587 261.728" stroke="#ef5552" stroke-opacity="0.9" stroke-width="16" stroke-linecap="round" stroke-linejoin="round"/>
-<path d="M120.11 189.392C117.82 213.426 116.295 237.66 114.546 261.728" stroke="#ef5552" stroke-opacity="0.9" stroke-width="16" stroke-linecap="round" stroke-linejoin="round"/>
-<path d="M133.018 191.617C130.467 214.481 133.489 242.725 131.687 262.84" stroke="#ef5552" stroke-opacity="0.9" stroke-width="16" stroke-linecap="round" stroke-linejoin="round"/>
-<path d="M156.834 190.504C155.165 212.389 152.383 234.105 152.383 256.163" stroke="#ef5552" stroke-opacity="0.9" stroke-width="16" stroke-linecap="round" stroke-linejoin="round"/>
-<path d="M170.257 192.729C170.227 216.032 169.009 239.434 169.009 262.84" stroke="#ef5552" stroke-opacity="0.9" stroke-width="16" stroke-linecap="round" stroke-linejoin="round"/>
-<path d="M127.227 203.675C125.215 203.711 123.631 203.17 121.895 202.927" stroke="#ef5552" stroke-opacity="0.9" stroke-width="16" stroke-linecap="round" stroke-linejoin="round"/>
-<path d="M164.531 202.629C161.934 203.12 160.138 201.951 158.041 201.631" stroke="#ef5552" stroke-opacity="0.9" stroke-width="16" stroke-linecap="round" stroke-linejoin="round"/>
-<path d="M194.64 190.504C194.317 211.918 192.477 233.888 192.477 255.05" stroke="#ef5552" stroke-opacity="0.9" stroke-width="16" stroke-linecap="round" stroke-linejoin="round"/>
-<path d="M211.075 192.729C210.357 215.678 210.574 238.717 210.574 261.727" stroke="#ef5552" stroke-opacity="0.9" stroke-width="16" stroke-linecap="round" stroke-linejoin="round"/>
-<path d="M234.735 198.294C230.696 217.051 233.372 237.419 230.283 256.163" stroke="#ef5552" stroke-opacity="0.9" stroke-width="16" stroke-linecap="round" stroke-linejoin="round"/>
-<path d="M247.257 198.294C247.517 219.421 248.118 240.78 248.922 261.727" stroke="#ef5552" stroke-opacity="0.9" stroke-width="16" stroke-linecap="round" stroke-linejoin="round"/>
-<path d="M276.09 198.294C276.696 219.814 273.428 241.434 273.428 262.84" stroke="#ef5552" stroke-opacity="0.9" stroke-width="16" stroke-linecap="round" stroke-linejoin="round"/>
-<path d="M293.809 199.407C293.414 220.715 294.808 241.656 294.808 262.84" stroke="#ef5552" stroke-opacity="0.9" stroke-width="16" stroke-linecap="round" stroke-linejoin="round"/>
-<path d="M321 199.407C320.7 220.28 317.879 247.076 319.001 265.066" stroke="#ef5552" stroke-opacity="0.9" stroke-width="16" stroke-linecap="round" stroke-linejoin="round"/>
-<path d="M335.889 198.294C335.35 223.471 333.895 248.801 333.895 273.968" stroke="#ef5552" stroke-opacity="0.9" stroke-width="16" stroke-linecap="round" stroke-linejoin="round"/>
-<path d="M196.897 203.278C200.248 202.326 202.739 203.495 205.8 203.695" stroke="#ef5552" stroke-opacity="0.9" stroke-width="16" stroke-linecap="round" stroke-linejoin="round"/>
-<path d="M233.622 204.357C235.117 204.099 236.593 203.716 238.074 203.36" stroke="#ef5552" stroke-opacity="0.9" stroke-width="16" stroke-linecap="round" stroke-linejoin="round"/>
-<path d="M281.475 207.848C282.979 208.035 284.481 207.724 285.926 207.601" stroke="#ef5552" stroke-opacity="0.9" stroke-width="16" stroke-linecap="round" stroke-linejoin="round"/>
-<path d="M81.1602 206.21C82.665 206.459 84.1663 206.045 85.6116 205.88" stroke="#ef5552" stroke-opacity="0.9" stroke-width="16" stroke-linecap="round" stroke-linejoin="round"/>
-<path d="M324.876 210.348C328.342 210.461 331.54 209.814 334.892 209.597" stroke="#ef5552" stroke-opacity="0.9" stroke-width="16" stroke-linecap="round" stroke-linejoin="round"/>
+<path d="M66.3377 155.388C65.7708 151.515 66.1155 142.825 71.4265 141.899C79.0689 140.57 79.2956 146.18 84.6566 147.58C96.4535 150.665 174.292 120.705 191.564 116.358C198.101 114.713 198.198 105 210.563 105C216.167 105 212.949 113.528 227.103 117.659C231.821 119.036 321.537 144.755 335.638 153.613C336.375 154.076 342.097 149.711 345.475 149.711C349.585 149.711 348.061 156.318 347.512 158.231" stroke="black" stroke-opacity="0.9" stroke-width="12" stroke-linecap="round" stroke-linejoin="round"/>
+<path d="M76.0438 157.18C104.715 153.671 133.644 159.144 162.337 159.144C180.702 159.144 198.876 159.796 217.155 160.78C232.592 161.607 248.391 161.432 263.851 161.432C268.61 161.432 339.399 162.499 339.992 163.068C341.344 164.376 339.313 175.156 339.313 177.794C339.313 179.238 340.507 188.39 339.992 188.592C312.456 199.24 148.067 185.975 105.484 185.975C100.317 185.975 74.2748 188.841 70.6327 185.319C69.0401 183.779 71.0896 170.186 71.3068 167.649" stroke="black" stroke-opacity="0.9" stroke-width="12" stroke-linecap="round" stroke-linejoin="round"/>
+<path d="M50.2399 286.757C48.2835 274.679 58.8072 280.438 63.1705 276.216C65.5956 273.869 62.1377 268.977 67.2563 267.326C86.9115 260.985 201.221 265.128 226.173 267.326C254.045 269.778 307.073 262.416 331.665 271.936C337.53 274.208 346.898 286.066 349.359 294" stroke="black" stroke-opacity="0.9" stroke-width="12" stroke-linecap="round" stroke-linejoin="round"/>
+<path d="M78.54 192.729C75.2103 217.791 74.8765 243.244 74.8765 268.404" stroke="black" stroke-opacity="0.9" stroke-width="12" stroke-linecap="round" stroke-linejoin="round"/>
+<path d="M95.6602 189.392C95.5489 213.231 94.1778 238.972 96.1587 261.728" stroke="black" stroke-opacity="0.9" stroke-width="12" stroke-linecap="round" stroke-linejoin="round"/>
+<path d="M120.11 189.392C117.82 213.426 116.295 237.66 114.546 261.728" stroke="black" stroke-opacity="0.9" stroke-width="12" stroke-linecap="round" stroke-linejoin="round"/>
+<path d="M133.018 191.617C130.467 214.481 133.489 242.725 131.687 262.84" stroke="black" stroke-opacity="0.9" stroke-width="12" stroke-linecap="round" stroke-linejoin="round"/>
+<path d="M156.834 190.504C155.165 212.389 152.383 234.105 152.383 256.163" stroke="black" stroke-opacity="0.9" stroke-width="12" stroke-linecap="round" stroke-linejoin="round"/>
+<path d="M170.257 192.729C170.227 216.032 169.009 239.434 169.009 262.84" stroke="black" stroke-opacity="0.9" stroke-width="12" stroke-linecap="round" stroke-linejoin="round"/>
+<path d="M127.227 203.675C125.215 203.711 123.631 203.17 121.895 202.927" stroke="black" stroke-opacity="0.9" stroke-width="12" stroke-linecap="round" stroke-linejoin="round"/>
+<path d="M164.531 202.629C161.934 203.12 160.138 201.951 158.041 201.631" stroke="black" stroke-opacity="0.9" stroke-width="12" stroke-linecap="round" stroke-linejoin="round"/>
+<path d="M194.64 190.504C194.317 211.918 192.477 233.888 192.477 255.05" stroke="black" stroke-opacity="0.9" stroke-width="12" stroke-linecap="round" stroke-linejoin="round"/>
+<path d="M211.075 192.729C210.357 215.678 210.574 238.717 210.574 261.727" stroke="black" stroke-opacity="0.9" stroke-width="12" stroke-linecap="round" stroke-linejoin="round"/>
+<path d="M234.735 198.294C230.696 217.051 233.372 237.419 230.283 256.163" stroke="black" stroke-opacity="0.9" stroke-width="12" stroke-linecap="round" stroke-linejoin="round"/>
+<path d="M247.257 198.294C247.517 219.421 248.118 240.78 248.922 261.727" stroke="black" stroke-opacity="0.9" stroke-width="12" stroke-linecap="round" stroke-linejoin="round"/>
+<path d="M276.09 198.294C276.696 219.814 273.428 241.434 273.428 262.84" stroke="black" stroke-opacity="0.9" stroke-width="12" stroke-linecap="round" stroke-linejoin="round"/>
+<path d="M293.809 199.407C293.414 220.715 294.808 241.656 294.808 262.84" stroke="black" stroke-opacity="0.9" stroke-width="12" stroke-linecap="round" stroke-linejoin="round"/>
+<path d="M321 199.407C320.7 220.28 317.879 247.076 319.001 265.066" stroke="black" stroke-opacity="0.9" stroke-width="12" stroke-linecap="round" stroke-linejoin="round"/>
+<path d="M335.889 198.294C335.35 223.471 333.895 248.801 333.895 273.968" stroke="black" stroke-opacity="0.9" stroke-width="12" stroke-linecap="round" stroke-linejoin="round"/>
+<path d="M196.897 203.278C200.248 202.326 202.739 203.495 205.8 203.695" stroke="black" stroke-opacity="0.9" stroke-width="12" stroke-linecap="round" stroke-linejoin="round"/>
+<path d="M233.622 204.357C235.117 204.099 236.593 203.716 238.074 203.36" stroke="black" stroke-opacity="0.9" stroke-width="12" stroke-linecap="round" stroke-linejoin="round"/>
+<path d="M281.475 207.848C282.979 208.035 284.481 207.724 285.926 207.601" stroke="black" stroke-opacity="0.9" stroke-width="12" stroke-linecap="round" stroke-linejoin="round"/>
+<path d="M81.1602 206.21C82.665 206.459 84.1663 206.045 85.6116 205.88" stroke="black" stroke-opacity="0.9" stroke-width="12" stroke-linecap="round" stroke-linejoin="round"/>
+<path d="M324.876 210.348C328.342 210.461 331.54 209.814 334.892 209.597" stroke="black" stroke-opacity="0.9" stroke-width="12" stroke-linecap="round" stroke-linejoin="round"/>
 </svg>
diff --git a/docs/api.rst b/docs/api.rst
index ced6bf4d97ba851bb16ff48e7c8c0e3859082a50..820cbd8b23aa0f9335c837fbeef2f06956c4781f 100644 (file)
--- a/docs/api.rst
+++ b/docs/api.rst
@@ -22,9 +22,8 @@ Parse
 .. autoclass:: TwitterEmbed
     :members:
 
-.. TODO: JSON Issue
-.. .. autoclass:: JsonParser
-..     :members:
+.. autoclass:: JsonParser
+    :members:
 
 
 Export
@@ -55,6 +54,7 @@ Utils
 .. autofunction:: clean_tweet_url
 .. autofunction:: clean_wayback_machine_url
 .. autofunction:: delete_tweet_pathnames
+.. autofunction:: is_tweet_url
 .. autofunction:: get_response
 .. autofunction:: parse_date
 .. autofunction:: semicolon_parser
diff --git a/docs/cli.rst b/docs/cli.rst
index 49abef2224516299d0a24e8bf3e33a42d1e21184..f64b299f71f2040e7c00ce1eef3773c3a8a1b20a 100644 (file)
--- a/docs/cli.rst
+++ b/docs/cli.rst
@@ -40,3 +40,38 @@ However, it is possible to use it with other options. Read below text extracted
    - Ex: Only show unique urls in a prefix query (filtering out captures except first capture of a given url). This is similar to the old prefix query in wayback (note: this query may be slow at the moment):
 
       http://web.archive.org/cdx/search/cdx?url=archive.org&collapse=urlkey&matchType=prefix
+
+
+URL Match Scope
+-----------------
+
+The CDX Server can return results matching a certain prefix, a certain host or all subdomains by using the ``matchType`` param.
+
+For example, with the value ``prefix`` it is possible to retrieve archived URLs beyond the ``/status/`` path.
+
+Read the text below, extracted from the official Wayback CDX Server API (Beta) documentation.
+
+.. note::
+
+   For example, if given the url: archive.org/about/ and:
+
+   - ``matchType=exact`` (default if omitted) will return results matching exactly archive.org/about/
+
+   - ``matchType=prefix`` will return results for all results under the path archive.org/about/
+
+      http://web.archive.org/cdx/search/cdx?url=archive.org/about/&matchType=prefix&limit=1000
+
+   - ``matchType=host`` will return results from host archive.org
+
+      http://web.archive.org/cdx/search/cdx?url=archive.org/about/&matchType=host&limit=1000
+
+   - ``matchType=domain`` will return results from host archive.org and all subhosts \*.archive.org
+
+      http://web.archive.org/cdx/search/cdx?url=archive.org/about/&matchType=domain&limit=1000
+
+   The matchType may also be set implicitly by using wildcard '*' at end or beginning of the url:
+
+   - If url is ends in '/*', eg url=archive.org/* the query is equivalent to url=archive.org/&matchType=prefix
+   - If url starts with '*.', eg url=*.archive.org/ the query is equivalent to url=archive.org/&matchType=domain
+
+   (Note: The domain mode is only available if the CDX is in `SURT <http://crawler.archive.org/articles/user_manual/glossary.html#surt>`_-order format.)
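A minimal sketch of the ``matchType`` scopes described in the cli.rst addition above, queried against the CDX endpoint directly with the standard ``requests`` package. The endpoint, URL, and parameter names come from the quoted CDX examples; the response handling (a JSON array whose first row is the field header) is an assumption of this sketch, not part of the commit.

```python
# Sketch: compare the four matchType scopes against the raw CDX endpoint.
# Assumes only the `requests` package; values mirror the examples above.
import requests

CDX_ENDPOINT = "http://web.archive.org/cdx/search/cdx"

for match_type in ("exact", "prefix", "host", "domain"):
    response = requests.get(
        CDX_ENDPOINT,
        params={
            "url": "archive.org/about/",
            "matchType": match_type,
            "limit": 5,
            "output": "json",
        },
        timeout=30,
    )
    # An empty body means no captures; otherwise the first row is the header.
    rows = response.json() if response.text.strip() else []
    captures = max(len(rows) - 1, 0)
    print(match_type, "->", captures, "captures")
```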
diff --git a/docs/errors.rst b/docs/errors.rst
index d12c436d81e56eb5646158b84d639d22c85a1b3b..38a8f1b80697d7425ec28a963fae5a2f0ac3026e 100644 (file)
--- a/docs/errors.rst
+++ b/docs/errors.rst
@@ -17,10 +17,10 @@ This error is raised when the package fails to establish a new connection with w
 
 The output message from the package would be: ``Failed to establish a new connection with web.archive.org. Max retries exceeded.``
 
-.. TODO: JSON Issue
-.. This is the error often returned when performing experimental parsing of URLs with the mimetype ``application/json``.
 
-.. The warning output message from the package would be: ``Connection error with https://web.archive.org/web/<TIMESTAMP>/https://twitter.com/<USERNAME>/status/<TWEET_ID>. Max retries exceeded. Error parsing the JSON, but the CDX data was saved.``
+This is the error often returned when performing experimental parsing of URLs with the mimetype ``application/json``.
+
+The warning output message from the package would be: ``Connection error with https://web.archive.org/web/<TIMESTAMP>/https://twitter.com/<USERNAME>/status/<TWEET_ID>. Max retries exceeded. Error parsing the JSON, but the CDX data was saved.``
 
 HTTPError
 ----------------
diff --git a/docs/quickstart.rst b/docs/quickstart.rst
index 916996beec4afeab5f378f5cc106f42350d98a65..8700ee362fdfaa9491b2e625edb34c8268679c15 100644 (file)
--- a/docs/quickstart.rst
+++ b/docs/quickstart.rst
@@ -29,8 +29,9 @@ Using Wayback Tweets as a Python Module
     timestamp_to = parse_date("20191231")
     limit = 250
     offset = 0
+    matchtype = "exact"
 
-    api = WaybackTweets(username, collapse, timestamp_from, timestamp_to, limit, offset)
+    api = WaybackTweets(username, collapse, timestamp_from, timestamp_to, limit, offset, matchtype)
 
     archived_tweets = api.get()
 
diff --git a/docs/streamlit.rst b/docs/streamlit.rst
index 5fc21cd258fa7ea433836d7525178dd211b16413..78da8662272df4c7478109b796d268e55dfe4a22 100644 (file)
--- a/docs/streamlit.rst
+++ b/docs/streamlit.rst
@@ -6,18 +6,17 @@ Aplication that displays multiple archived tweets on Wayback Machine to avoid op
 
 Filters
 ----------
+- Filtering by date range: Using the ``from`` and ``to`` filters
 
-- Calendar: Filtering by date range
+- Only unavailable tweets: Checks if the archived URL still exists on Twitter (see the `flowchart <workflow.html>`_)
 
-- Checkbox: Only tweets not available
-
-- Checkbox: Only unique URLs (filtering by the collapse option using the urlkey field)
+- Only unique URLs: Filtering by the collapse option using the ``urlkey`` field and the URL Match Scope ``prefix``
 
 
 Pagination
 ------------
 
-Pagination is automatic and allows viewing up to 25 tweets per page. This is a fixed value due to the API rate limit.
+Pagination allows viewing up to 25 tweets per page. This helps avoid API rate limiting when parsing returns with the mimetype ``application/json``.
 
 
 Community Comments
diff --git a/waybacktweets/api/parse_tweets.py b/waybacktweets/api/parse_tweets.py
index 39125905e571315842d8a3e389b1b797abac267c..e42f6c583f4912e1236c93c89e47fcac2cb7c0b6 100644 (file)
--- a/waybacktweets/api/parse_tweets.py
+++ b/waybacktweets/api/parse_tweets.py
@@ -14,6 +14,7 @@ from waybacktweets.utils.utils import (
     clean_tweet_url,
     delete_tweet_pathnames,
     get_response,
+    is_tweet_url,
     semicolon_parser,
 )
 
@@ -95,7 +96,9 @@ class TwitterEmbed:
 # TODO: JSON Issue - Create separate function to handle JSON return without hitting rate limiting # noqa: E501
 class JsonParser:
     """
-    Class responsible for parsing tweets when the mimetype is application/json.
+    Class responsible for parsing tweets when the mimetype is application/json.\n
+    Note: This class is in an experimental phase, but it is currently being
+    used by the Streamlit Web App.
 
     :param archived_tweet_url: The URL of the archived tweet to be parsed.
     """
@@ -201,13 +204,24 @@ class TweetsParser:
         encoded_parsed_tweet = semicolon_parser(original_tweet)
         encoded_parsed_archived_tweet = semicolon_parser(parsed_wayback_machine_url)
 
-        embed_parser = TwitterEmbed(encoded_tweet)
-        content = embed_parser.embed()
+        available_tweet_text = None
+        available_tweet_is_RT = None
+        available_tweet_info = None
 
-        if content:
-            self._add_field("available_tweet_text", semicolon_parser(content[0][0]))
-            self._add_field("available_tweet_is_RT", content[1][0])
-            self._add_field("available_tweet_info", semicolon_parser(content[2][0]))
+        is_tweet = is_tweet_url(encoded_tweet)
+
+        if is_tweet:
+            embed_parser = TwitterEmbed(encoded_tweet)
+            content = embed_parser.embed()
+
+            if content:
+                available_tweet_text = semicolon_parser(content[0][0])
+                available_tweet_is_RT = content[1][0]
+                available_tweet_info = semicolon_parser(content[2][0])
+
+        self._add_field("available_tweet_text", available_tweet_text)
+        self._add_field("available_tweet_is_RT", available_tweet_is_RT)
+        self._add_field("available_tweet_info", available_tweet_info)
 
         # TODO: JSON Issue
         # parsed_text_json = ""
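For context on the now-documented experimental ``JsonParser``, here is a hedged sketch of the flow used by the Streamlit app hunk earlier in this commit: the constructor takes the archived tweet URL, ``parse()`` returns the tweet text, and ``semicolon_parser`` sanitizes it. The archived URL below is a hypothetical placeholder.

```python
# Sketch of the JsonParser flow for application/json captures.
# The archived URL is a placeholder; parse() may fail under rate limiting.
from waybacktweets.api.parse_tweets import JsonParser
from waybacktweets.utils.utils import semicolon_parser

archived_tweet_url = (
    "https://web.archive.org/web/20200101000000/"
    "https://twitter.com/jack/status/20"
)

json_parser = JsonParser(archived_tweet_url)
text_json = json_parser.parse()

if text_json:
    parsed_text_json = semicolon_parser(text_json)
    print(parsed_text_json)
```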
diff --git a/waybacktweets/api/request_tweets.py b/waybacktweets/api/request_tweets.py
index 07e9903f6b4da5ea5fdaf82ff86c4eecb856a767..83f04053732e879ac05dbd5418a83136bcc02a13 100644 (file)
--- a/waybacktweets/api/request_tweets.py
+++ b/waybacktweets/api/request_tweets.py
@@ -16,6 +16,7 @@ class WaybackTweets:
     :param timestamp_to: The timestamp to stop retrieving tweets at.
     :param limit: The maximum number of results to return.
     :param offset: The number of lines to skip in the results.
+    :param matchtype: Results matching a certain prefix, a certain host, or all subdomains. # noqa: E501
     """
 
     def __init__(
@@ -26,6 +27,7 @@ class WaybackTweets:
         timestamp_to: str,
         limit: int,
         offset: int,
+        matchtype: str,
     ):
         self.username = username
         self.collapse = collapse
@@ -33,6 +35,7 @@ class WaybackTweets:
         self.timestamp_to = timestamp_to
         self.limit = limit
         self.offset = offset
+        self.matchtype = matchtype
 
     def get(self) -> Optional[Dict[str, Any]]:
         """
@@ -42,8 +45,13 @@ class WaybackTweets:
         :returns: The response from the CDX API in JSON format, if successful.
         """
         url = "https://web.archive.org/cdx/search/cdx"
+
+        status = "/status/*"
+        if self.matchtype != "exact":
+            status = ""
+
         params = {
-            "url": f"https://twitter.com/{self.username}/status/*",
+            "url": f"https://twitter.com/{self.username}{status}",
             "output": "json",
         }
 
@@ -62,6 +70,9 @@ class WaybackTweets:
         if self.offset:
             params["offset"] = self.offset
 
+        if self.matchtype:
+            params["matchType"] = self.matchtype
+
         try:
             response = get_response(url=url, params=params)
 
@@ -71,7 +82,7 @@ class WaybackTweets:
             rprint("[red]Connection to web.archive.org timed out.")
         except exceptions.ConnectionError:
             rprint(
-                "[red]Failed to establish a new connection with web.archive.org. Max retries exceeded."  # noqa: E501
+                "[red]Failed to establish a new connection with web.archive.org. Max retries exceeded. Please wait a few minutes and try again."  # noqa: E501
             )
         except exceptions.HTTPError:
             rprint(
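A quick sketch of the new ``matchtype`` argument end to end, mirroring the README/quickstart snippet in this commit. The only addition is the loop over two scopes; per the ``get()`` logic above, ``prefix`` should drop the ``/status/*`` suffix from the queried URL. The ``collapse = None`` default is an assumption for illustration.

```python
# Sketch: the README example, run once with "exact" scope (status URLs only)
# and once with "prefix" (all URLs under the user path).
from waybacktweets.api.request_tweets import WaybackTweets
from waybacktweets.utils.utils import parse_date

username = "jack"
collapse = None  # assumption: no collapsing for this sketch
timestamp_from = parse_date("20150101")
timestamp_to = parse_date("20191231")
limit = 250
offset = 0

for matchtype in ("exact", "prefix"):
    api = WaybackTweets(
        username, collapse, timestamp_from, timestamp_to, limit, offset, matchtype
    )
    archived_tweets = api.get()  # None is returned on connection/HTTP errors
    print(matchtype, "->", "no data" if not archived_tweets else "CDX data retrieved")
```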
diff --git a/waybacktweets/cli/main.py b/waybacktweets/cli/main.py
index 96509c5716fc1e9256c6a059e11ce5a4166ec244..987bac2a554a00807b65937e5b8aebb08b0fe077 100644 (file)
--- a/waybacktweets/cli/main.py
+++ b/waybacktweets/cli/main.py
@@ -40,13 +40,22 @@ from waybacktweets.utils.utils import parse_date
     default=None,
     help="Filtering by date range up to this date. Format: YYYYmmdd",
 )
-@click.option("--limit", type=int, default=None, help="Query result limits.")
+@click.option(
+    "--limit", type=int, metavar="INTEGER", default=None, help="Query result limits."
+)
 @click.option(
     "--offset",
     type=int,
+    metavar="INTEGER",
     default=None,
     help="Allows for a simple way to scroll through the results.",
 )
+@click.option(
+    "--matchtype",
+    type=click.Choice(["exact", "prefix", "host", "domain"], case_sensitive=False),
+    default=None,
+    help="Results matching a certain prefix, a certain host or all subdomains. Default: exact",  # noqa: E501
+)
 def cli(
     username: str,
     collapse: Optional[str],
@@ -54,6 +63,7 @@ def cli(
     timestamp_to: Optional[str],
     limit: Optional[int],
     offset: Optional[int],
+    matchtype: Optional[str],
 ) -> None:
     """
     Retrieves archived tweets CDX data from the Wayback Machine,
@@ -63,7 +73,7 @@ def cli(
     """
     try:
         api = WaybackTweets(
-            username, collapse, timestamp_from, timestamp_to, limit, offset
+            username, collapse, timestamp_from, timestamp_to, limit, offset, matchtype
         )
 
         print("Making a request to the Internet Archive...")
@@ -77,7 +87,6 @@ def cli(
                 "archived_tweet_url",
                 "parsed_tweet_url",
                 "parsed_archived_tweet_url",
-                # "parsed_tweet_text_mimetype_json", # TODO: JSON Issue
                 "available_tweet_text",
                 "available_tweet_is_RT",
                 "available_tweet_info",
diff --git a/waybacktweets/utils/utils.py b/waybacktweets/utils/utils.py
index 50ea728f7d6ae44aa9a2e32000479e18dca0da91..5163b336ceed658f4ea377dcaa8b0a7c575835e8 100644 (file)
--- a/waybacktweets/utils/utils.py
+++ b/waybacktweets/utils/utils.py
@@ -179,3 +179,20 @@ def parse_date(
         return date.strftime("%Y%m%d")
     except ValueError:
         raise click.BadParameter("Date must be in format YYYYmmdd")
+
+
+def is_tweet_url(twitter_url: str) -> bool:
+    """
+    Checks if the provided URL is a Twitter status URL.
+
+    This function checks if the provided URL contains "/status/" exactly once,
+    which is a common pattern in Twitter status URLs.
+
+    :param twitter_url: The URL to check.
+
+    :returns: True if the URL is a Twitter status URL, False otherwise.
+    """
+    if twitter_url.count("/status/") == 1:
+        return True
+
+    return False
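A small usage sketch of the new ``is_tweet_url`` helper, showing what its single-occurrence ``/status/`` check accepts and rejects. The URLs are illustrative only.

```python
# Sketch: is_tweet_url returns True only when "/status/" appears exactly once.
from waybacktweets.utils.utils import is_tweet_url

print(is_tweet_url("https://twitter.com/jack/status/20"))            # True
print(is_tweet_url("https://twitter.com/jack"))                      # False: no /status/
print(is_tweet_url("https://twitter.com/jack/status/20/status/20"))  # False: /status/ twice
```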