From: claromes Date: Sun, 25 May 2025 09:57:19 +0000 (-0300) Subject: fix rk, update docs, fix json gen, update html viz X-Git-Url: https://git.claromes.com/?a=commitdiff_plain;h=d693cb26de14e6c84948e37fb0bc3be5cda781f6;p=waybacktweets.git fix rk, update docs, fix json gen, update html viz --- diff --git a/.gitignore b/.gitignore index a0062e8..a3190b1 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,9 @@ *.csv *.json *.html +*.txt + +test.py waybacktweets/__pycache__ waybacktweets/api/__pycache__ diff --git a/README.md b/README.md index 92c461e..25727ce 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,6 @@ # Wayback Tweets -[![PyPI](https://img.shields.io/pypi/v/waybacktweets)](https://pypi.org/project/waybacktweets) [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.12528447.svg)](https://doi.org/10.5281/zenodo.12528447) [![Streamlit App](https://static.streamlit.io/badges/streamlit_badge_black_white.svg)](https://waybacktweets.streamlit.app) [![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1tnaM3rMWpoSHBZ4P_6iHFPjraWRQ3OGe?usp=sharing) - +[![PyPI](https://img.shields.io/pypi/v/waybacktweets)](https://pypi.org/project/waybacktweets) [![PyPI Downloads](https://static.pepy.tech/badge/waybacktweets)](https://pepy.tech/projects/waybacktweets) Retrieves archived tweets CDX data from the Wayback Machine, performs necessary parsing (see [Field Options](https://claromes.github.io/waybacktweets/field_options.html)), and saves the data in HTML, for easy viewing of the tweets using the iframe tags, CSV, and JSON formats. 
@@ -11,21 +10,50 @@ Retrieves archived tweets CDX data from the Wayback Machine, performs necessary pip install waybacktweets ``` -## Quickstart - -### Using Wayback Tweets as a standalone command line tool - -waybacktweets [OPTIONS] USERNAME +## CLI ```shell -waybacktweets --from 20150101 --to 20191231 --limit 250 jack +Usage: waybacktweets [OPTIONS] USERNAME + + USERNAME: The Twitter username without @ + +Options: + -c, --collapse [urlkey|digest|timestamp:XX] + Collapse results based on a field, or a + substring of a field. XX in the timestamp + value ranges from 1 to 14, comparing the + first XX digits of the timestamp field. It + is recommended to use from 4 onwards, to + compare at least by years. + -f, --from DATE Filtering by date range from this date. + Format: YYYYmmdd + -t, --to DATE Filtering by date range up to this date. + Format: YYYYmmdd + -l, --limit INTEGER Query result limits. + -rk, --resumption_key TEXT Allows for a simple way to scroll through + the results. Key to continue the query from + the end of the previous query. + -mt, --matchtype [exact|prefix|host|domain] + Results matching a certain prefix, a certain + host or all subdomains. + -v, --verbose Shows the log. + --version Show the version and exit. + -h, --help Show this message and exit. + + Examples: + + Retrieve all tweets: waybacktweets jack + + With options and verbose output: waybacktweets --from 20200305 --to 20231231 --limit 300 --verbose jack + + Documentation: + + https://claromes.github.io/waybacktweets/ ``` -### Using Wayback Tweets as a Web App - -[Open the application](https://waybacktweets.streamlit.app), a prototype written in Python with the Streamlit framework and hosted on Streamlit Cloud. 
+## Module -### Using Wayback Tweets as a Python Module +[![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1tnaM3rMWpoSHBZ4P_6iHFPjraWRQ3OGe?usp=sharing) ```python from waybacktweets import WaybackTweets, TweetsParser, TweetsExporter @@ -37,10 +65,21 @@ archived_tweets = api.get() if archived_tweets: field_options = [ + "archived_urlkey", "archived_timestamp", - "original_tweet_url", + "parsed_archived_timestamp", "archived_tweet_url", + "parsed_archived_tweet_url", + "original_tweet_url", + "parsed_tweet_url", + "available_tweet_text", + "available_tweet_is_RT", + "available_tweet_info", + "archived_mimetype", "archived_statuscode", + "archived_digest", + "archived_length", + "resumption_key", ] parser = TweetsParser(archived_tweets, USERNAME, field_options) @@ -48,8 +87,19 @@ if archived_tweets: exporter = TweetsExporter(parsed_tweets, USERNAME, field_options) exporter.save_to_csv() + exporter.save_to_json() + exporter.save_to_html() ``` +## Web App + +[![Streamlit App](https://static.streamlit.io/badges/streamlit_badge_black_white.svg)](https://waybacktweets.streamlit.app) + +A prototype written in Python with the Streamlit framework and hosted on Streamlit Cloud. + +> [!NOTE] +> Starting from version 1.0, the web app will not receive all updates from the official package. To access all features, prefer the package via PyPI. + ## Documentation - [Wayback Tweets documentation](https://claromes.github.io/waybacktweets) @@ -57,9 +107,10 @@ if archived_tweets: ## Acknowledgements -- Tristan Lee (Bellingcat's Data Scientist) for the idea of the application. +- Tristan Lee (Bellingcat's Data Scientist) for the idea. - Jessica Smith (Snowflake's Community Growth Specialist) and Streamlit/Snowflake team for the additional server resources on Streamlit Cloud. -- OSINT Community for recommending the application. +- OSINT Community for recommending the package and the application. 
-> [!NOTE] -> If the Streamlit application is down, please check the [Streamlit Cloud Status](https://www.streamlitstatus.com/). +## License + +[GPL-3.0](LICENSE.md) diff --git a/docs/conf.py b/docs/conf.py index b6304e9..ff4ae8f 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -5,7 +5,7 @@ from pallets_sphinx_themes import ProjectLink, get_version project = "Wayback Tweets" release, version = get_version("waybacktweets") rst_epilog = f".. |release| replace:: v{release}" -copyright = f"2023 - {datetime.datetime.now().year}, Claromes · Icon by The Doodle Library · Title font by Google, licensed under the Open Font License · Pre-release: v{release}" # noqa: E501 +copyright = f"2023 - {datetime.datetime.now().year}, Claromes · Icon by The Doodle Library · Title font by Google, licensed under the Open Font License · Release: v{release}" # noqa: E501 author = "Claromes" # -- General configuration --------------------------------------------------- diff --git a/docs/field_options.rst b/docs/field_options.rst index 3c4a0ae..09da2e1 100644 --- a/docs/field_options.rst +++ b/docs/field_options.rst @@ -40,3 +40,5 @@ The package performs several parses to facilitate the analysis of archived tweet - ``archived_digest``: (`str`) The ``SHA1`` hash digest of the content, excluding the headers. It's usually a base-32-encoded string. - ``archived_length``: (`int`) The compressed byte size of the corresponding WARC record, which includes WARC headers, HTTP headers, and content payload. + +- ``resumption_key``: (`str`) Allows for a simple way to scroll through the results. Key to continue the query from the end of the previous query. diff --git a/docs/handson.rst b/docs/handson.rst index c6b9b2b..746d495 100644 --- a/docs/handson.rst +++ b/docs/handson.rst @@ -19,4 +19,3 @@ Hands-On Examples :target: https://colab.research.google.com/drive/1tnaM3rMWpoSHBZ4P_6iHFPjraWRQ3OGe?usp=sharing :alt: Open In Collab -.. 
raw:: html diff --git a/docs/index.rst b/docs/index.rst index c06b4d1..4c1ecb8 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -39,23 +39,21 @@ Command-Line Interface cli -Streamlit Web App -------------------- +API Reference +--------------- .. toctree:: :maxdepth: 2 - streamlit - + api -API Reference ---------------- +Streamlit Web App +------------------- .. toctree:: :maxdepth: 2 - api - + streamlit Additional Information ----------------------- diff --git a/docs/outputs.rst b/docs/outputs.rst index 365db72..7b700d0 100644 --- a/docs/outputs.rst +++ b/docs/outputs.rst @@ -14,10 +14,14 @@ This format allows for easy viewing of the archived tweets, through the use of t - ``original_tweet_url``: (`str`) The original tweet URL. -- ``parsed_tweet_url``: (`str`) The original tweet URL after parsing. Old URLs were archived in a nested manner. The parsing applied here unnests these URLs, when necessary. Check the :ref:`utils`. +- ``parsed_tweet_url``: (`str`) The original tweet URL after parsing. Old URLs were archived in a nested manner. The parsing applied here unnests these URLs when necessary. Refer to the :ref:`utils` for more details. Additionally, other fields are displayed. +.. note:: + + The iframes (accordions) are best viewed in Firefox. + CSV -------- diff --git a/docs/quickstart.rst b/docs/quickstart.rst index d05e5c7..209b11a 100644 --- a/docs/quickstart.rst +++ b/docs/quickstart.rst @@ -12,13 +12,6 @@ waybacktweets [OPTIONS] USERNAME waybacktweets --from 20150101 --to 20191231 --limit 250 jack -Web App -------------- - -Using Wayback Tweets as a Streamlit Web App. - -`Open the application <https://waybacktweets.streamlit.app>`_, a prototype written in Python with the Streamlit framework and hosted on Streamlit Cloud. - Module ------------- @@ -35,10 +28,21 @@ Using Wayback Tweets as a Python Module.
if archived_tweets: field_options = [ + "archived_urlkey", "archived_timestamp", - "original_tweet_url", + "parsed_archived_timestamp", "archived_tweet_url", + "parsed_archived_tweet_url", + "original_tweet_url", + "parsed_tweet_url", + "available_tweet_text", + "available_tweet_is_RT", + "available_tweet_info", + "archived_mimetype", "archived_statuscode", + "archived_digest", + "archived_length", + "resumption_key", ] parser = TweetsParser(archived_tweets, USERNAME, field_options) @@ -46,3 +50,12 @@ Using Wayback Tweets as a Python Module. exporter = TweetsExporter(parsed_tweets, USERNAME, field_options) exporter.save_to_csv() + exporter.save_to_json() + exporter.save_to_html() + +Web App +------------- + +Using Wayback Tweets as a Streamlit Web App. + +`Open the application <https://waybacktweets.streamlit.app>`_, a prototype written in Python with the Streamlit framework and hosted on Streamlit Cloud. diff --git a/docs/streamlit.rst b/docs/streamlit.rst index bf748e2..e04bd17 100644 --- a/docs/streamlit.rst +++ b/docs/streamlit.rst @@ -1,6 +1,10 @@ Web App ========= +.. note:: + + Starting from version 1.0, the web app will not receive all updates from the official package. To access all features, prefer the package via PyPI. + The application is a prototype hosted on Streamlit Cloud, serving as an alternative to the command line tool. `Open the application <https://waybacktweets.streamlit.app>`_. @@ -13,8 +17,6 @@ Filters - Limit: Query result limits. -- Resumption Key: Allows for a simple way to scroll through the results. Key to continue the query from the end of the previous query.
- - Only unique Wayback Machine URLs: Filtering by the collapse option using the ``urlkey`` field and the URL Match Scope ``prefix`` diff --git a/pyproject.toml b/pyproject.toml index 74c9fc7..89e17e5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "waybacktweets" -version = "1.0rc1" +version = "1.0" description = "Retrieves archived tweets CDX data from the Wayback Machine, performs necessary parsing, and saves the data." authors = ["Claromes "] license = "GPLv3" @@ -8,6 +8,7 @@ readme = "README.md" repository = "https://github.com/claromes/waybacktweets" keywords = [ "twitter", + "X", "tweet", "internet-archive", "wayback-machine", @@ -16,13 +17,14 @@ keywords = [ "command-line", ] classifiers = [ - "Development Status :: 4 - Beta", + "Development Status :: 5 - Production/Stable", "Intended Audience :: Developers", "Intended Audience :: Science/Research", "License :: OSI Approved :: GNU General Public License v3 (GPLv3)", "Natural Language :: English", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", "Topic :: Software Development", "Topic :: Utilities", ] diff --git a/waybacktweets/_cli.py b/waybacktweets/_cli.py index d390801..ed95303 100644 --- a/waybacktweets/_cli.py +++ b/waybacktweets/_cli.py @@ -138,7 +138,7 @@ def main( matchtype, ) - print(f"Retrieving the archived tweets of @{username}...") + print("Retrieving...") archived_tweets = api.get() if archived_tweets: diff --git a/waybacktweets/api/export.py b/waybacktweets/api/export.py index 6524bfe..b5b1b80 100644 --- a/waybacktweets/api/export.py +++ b/waybacktweets/api/export.py @@ -3,7 +3,6 @@ Exports the parsed archived tweets. 
""" import datetime -import os import re from typing import Any, Dict, List, Optional @@ -93,6 +92,17 @@ class TweetsExporter: print(f"Saved to {csv_file_path}") + def generate_json(self) -> str: + """ + Generates JSON data from the DataFrame (without saving to a file). + + Returns: + The JSON-formatted string of the DataFrame. + """ + + json_data = self.dataframe.to_json(orient="records", lines=False) + return json_data + def save_to_json(self) -> None: """ Saves the DataFrame to a JSON file. @@ -106,14 +116,11 @@ class TweetsExporter: """ Saves the DataFrame to an HTML file. """ - json_path = f"{self.filename}.json" - - if not os.path.exists(json_path): - self.save_to_json() + json_data = self.generate_json() html_file_path = f"{self.filename}.html" - html = HTMLTweetsVisualizer(self.username, json_path, html_file_path) + html = HTMLTweetsVisualizer(self.username, json_data, html_file_path) html_content = html.generate() html.save(html_content) diff --git a/waybacktweets/api/parse.py b/waybacktweets/api/parse.py index 35f26ba..00cdc89 100644 --- a/waybacktweets/api/parse.py +++ b/waybacktweets/api/parse.py @@ -173,13 +173,11 @@ class TweetsParser: if not all(option in FIELD_OPTIONS for option in field_options): raise ValueError("Some field options are not valid.") - self.archived_tweets_response = archived_tweets_response + self.archived_tweets_response = archived_tweets_response[0] self.username = username self.field_options = field_options self.parsed_tweets = {option: [] for option in self.field_options} - - if "resumption_key" not in self.parsed_tweets: - self.parsed_tweets["resumption_key"] = [] + self.show_resume_key = archived_tweets_response[1]["show_resume_key"] self._add_resumption_key() @@ -198,8 +196,12 @@ class TweetsParser: if not self.archived_tweets_response: raise ValueError("The list of archived tweet responses is empty.") - resumption_key = self.archived_tweets_response[-1][0] - self.parsed_tweets["resumption_key"].append(resumption_key) + 
resumption_key = ( + self.archived_tweets_response[-1][0] if self.show_resume_key else None + ) + if self.show_resume_key and "resumption_key" in self.parsed_tweets: + self.parsed_tweets["resumption_key"] = [] + self.parsed_tweets["resumption_key"].append(resumption_key) def _add_field(self, key: str, value: Any) -> None: """ @@ -317,8 +319,9 @@ class TweetsParser: if print_progress: progress.update(task, advance=1) - rprint( - f"[blue]Resumption Key: [bold]{self.archived_tweets_response[-1][0]}[/bold]\nUse the Resumption Key (--resumption_key, -rk) option to continue the query from where the previous one ended. This allows you to break a large query into smaller queries more efficiently.[/blue]\n" # noqa: E501 - ) + if self.show_resume_key: + rprint( + f'[blue]Resumption Key: [bold]{self.archived_tweets_response[-1][0]}[/bold][/blue]\nUse this Resumption Key option (--resumption_key in the CLI or "resumption_key" in field_options via the API) to continue the query from where the previous one left off. 
This allows you to split a large query into smaller, more efficient ones.\n' # noqa: E501 + ) return self.parsed_tweets diff --git a/waybacktweets/api/request.py b/waybacktweets/api/request.py index a503566..f44c03c 100644 --- a/waybacktweets/api/request.py +++ b/waybacktweets/api/request.py @@ -58,13 +58,13 @@ class WaybackTweets: """ # noqa: E501 url = "https://web.archive.org/cdx/search/cdx" - wildcard_pathname = "/*" - if self.matchtype: - wildcard_pathname = "" + wildcard_pathname = "" if self.matchtype else "/*" + + show_resume_key = bool(self.limit) params = { "url": f"https://twitter.com/{self.username}/status{wildcard_pathname}", - "showResumeKey": "true", + "showResumeKey": show_resume_key, "output": "json", } @@ -88,7 +88,7 @@ class WaybackTweets: try: response = get_response(url=url, params=params) - return response.json() + return response.json(), {"show_resume_key": show_resume_key} except ReadTimeoutError: if config.verbose: rprint("[red]Connection to web.archive.org timed out.") diff --git a/waybacktweets/api/visualize.py b/waybacktweets/api/visualize.py index f679ca5..3c31c0f 100644 --- a/waybacktweets/api/visualize.py +++ b/waybacktweets/api/visualize.py @@ -66,7 +66,7 @@ class HTMLTweetsVisualizer: html += ( '\n' ) - html += f"@{self.username}'s archived tweets\n" + html += f"Wayback Tweets from @{self.username}\n" # Adds styling html += "