From 9c459f0e049f0431215d5692946ec9780d438f79 Mon Sep 17 00:00:00 2001 From: Claromes Date: Sat, 8 Jun 2024 14:36:36 -0300 Subject: [PATCH] add cli --- docs/index.md | 2 +- docs/stylesheets/extra.css | 4 ++ mkdocs.yml | 1 + poetry.lock | 122 +------------------------------- pyproject.toml | 7 +- waybacktweets/cli.py | 91 ++++++++++++++++++++++++ waybacktweets/export_tweets.py | 18 +++-- waybacktweets/main.py | 64 ----------------- waybacktweets/parse_tweets.py | 49 +++++++------ waybacktweets/request_tweets.py | 16 +++-- waybacktweets/viz_tweets.py | 2 +- 11 files changed, 145 insertions(+), 231 deletions(-) create mode 100644 waybacktweets/cli.py delete mode 100644 waybacktweets/main.py diff --git a/docs/index.md b/docs/index.md index 93adca9..0c0afb0 100644 --- a/docs/index.md +++ b/docs/index.md @@ -2,4 +2,4 @@ # Wayback Tweets -Commands and Python functions for retrieving archived tweets +Retrieves archived tweets' CDX data from the Wayback Machine, performs necessary parsing, and saves the data. diff --git a/docs/stylesheets/extra.css b/docs/stylesheets/extra.css index 68a1a00..93f5b9e 100644 --- a/docs/stylesheets/extra.css +++ b/docs/stylesheets/extra.css @@ -15,6 +15,10 @@ max-height: none; } +.magnifyingglass { + scale: 1.2; +} + .md-source__icon svg { scale: 1.5; } diff --git a/mkdocs.yml b/mkdocs.yml index ca3905e..d744b2c 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -3,6 +3,7 @@ repo_url: https://github.com/claromes/waybacktweets/ repo_name: claromes/waybacktweets edit_uri: tree/main/docs site_author: Claromes +site_description: Retrieves archived tweets' CDX data from the Wayback Machine copyright: Copyright © 2023 - 2024 Claromes · Icons by TheDoodleLibrary theme: diff --git a/poetry.lock b/poetry.lock index 5b3859e..ae31e45 100644 --- a/poetry.lock +++ b/poetry.lock @@ -159,70 +159,6 @@ files = [ {file = "certifi-2024.6.2.tar.gz", hash = "sha256:3cd43f1c6fa7dedc5899d69d3ad0398fd018ad1a17fba83ddaf78aa46c747516"}, ] -[[package]] -name = "cffi" -version = "1.16.0" -description = "Foreign Function Interface for Python calling C code." -optional = false -python-versions = ">=3.8" -files = [ - {file = "cffi-1.16.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:6b3d6606d369fc1da4fd8c357d026317fbb9c9b75d36dc16e90e84c26854b088"}, - {file = "cffi-1.16.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ac0f5edd2360eea2f1daa9e26a41db02dd4b0451b48f7c318e217ee092a213e9"}, - {file = "cffi-1.16.0-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7e61e3e4fa664a8588aa25c883eab612a188c725755afff6289454d6362b9673"}, - {file = "cffi-1.16.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a72e8961a86d19bdb45851d8f1f08b041ea37d2bd8d4fd19903bc3083d80c896"}, - {file = "cffi-1.16.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5b50bf3f55561dac5438f8e70bfcdfd74543fd60df5fa5f62d94e5867deca684"}, - {file = "cffi-1.16.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7651c50c8c5ef7bdb41108b7b8c5a83013bfaa8a935590c5d74627c047a583c7"}, - {file = "cffi-1.16.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e4108df7fe9b707191e55f33efbcb2d81928e10cea45527879a4749cbe472614"}, - {file = "cffi-1.16.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:32c68ef735dbe5857c810328cb2481e24722a59a2003018885514d4c09af9743"}, - {file = "cffi-1.16.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:673739cb539f8cdaa07d92d02efa93c9ccf87e345b9a0b556e3ecc666718468d"}, - {file = "cffi-1.16.0-cp310-cp310-win32.whl", hash = "sha256:9f90389693731ff1f659e55c7d1640e2ec43ff725cc61b04b2f9c6d8d017df6a"}, - {file = "cffi-1.16.0-cp310-cp310-win_amd64.whl", hash = "sha256:e6024675e67af929088fda399b2094574609396b1decb609c55fa58b028a32a1"}, - {file = "cffi-1.16.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b84834d0cf97e7d27dd5b7f3aca7b6e9263c56308ab9dc8aae9784abb774d404"}, - {file = "cffi-1.16.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1b8ebc27c014c59692bb2664c7d13ce7a6e9a629be20e54e7271fa696ff2b417"}, - {file = "cffi-1.16.0-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ee07e47c12890ef248766a6e55bd38ebfb2bb8edd4142d56db91b21ea68b7627"}, - {file = "cffi-1.16.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d8a9d3ebe49f084ad71f9269834ceccbf398253c9fac910c4fd7053ff1386936"}, - {file = "cffi-1.16.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e70f54f1796669ef691ca07d046cd81a29cb4deb1e5f942003f401c0c4a2695d"}, - {file = "cffi-1.16.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5bf44d66cdf9e893637896c7faa22298baebcd18d1ddb6d2626a6e39793a1d56"}, - {file = "cffi-1.16.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7b78010e7b97fef4bee1e896df8a4bbb6712b7f05b7ef630f9d1da00f6444d2e"}, - {file = "cffi-1.16.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:c6a164aa47843fb1b01e941d385aab7215563bb8816d80ff3a363a9f8448a8dc"}, - {file = "cffi-1.16.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e09f3ff613345df5e8c3667da1d918f9149bd623cd9070c983c013792a9a62eb"}, - {file = "cffi-1.16.0-cp311-cp311-win32.whl", hash = "sha256:2c56b361916f390cd758a57f2e16233eb4f64bcbeee88a4881ea90fca14dc6ab"}, - {file = "cffi-1.16.0-cp311-cp311-win_amd64.whl", hash = "sha256:db8e577c19c0fda0beb7e0d4e09e0ba74b1e4c092e0e40bfa12fe05b6f6d75ba"}, - {file = "cffi-1.16.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:fa3a0128b152627161ce47201262d3140edb5a5c3da88d73a1b790a959126956"}, - {file = "cffi-1.16.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:68e7c44931cc171c54ccb702482e9fc723192e88d25a0e133edd7aff8fcd1f6e"}, - {file = "cffi-1.16.0-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:abd808f9c129ba2beda4cfc53bde801e5bcf9d6e0f22f095e45327c038bfe68e"}, - {file = "cffi-1.16.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:88e2b3c14bdb32e440be531ade29d3c50a1a59cd4e51b1dd8b0865c54ea5d2e2"}, - {file = "cffi-1.16.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:fcc8eb6d5902bb1cf6dc4f187ee3ea80a1eba0a89aba40a5cb20a5087d961357"}, - {file = "cffi-1.16.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b7be2d771cdba2942e13215c4e340bfd76398e9227ad10402a8767ab1865d2e6"}, - {file = "cffi-1.16.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e715596e683d2ce000574bae5d07bd522c781a822866c20495e52520564f0969"}, - {file = "cffi-1.16.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:2d92b25dbf6cae33f65005baf472d2c245c050b1ce709cc4588cdcdd5495b520"}, - {file = "cffi-1.16.0-cp312-cp312-win32.whl", hash = "sha256:b2ca4e77f9f47c55c194982e10f058db063937845bb2b7a86c84a6cfe0aefa8b"}, - {file = "cffi-1.16.0-cp312-cp312-win_amd64.whl", hash = "sha256:68678abf380b42ce21a5f2abde8efee05c114c2fdb2e9eef2efdb0257fba1235"}, - {file = "cffi-1.16.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:0c9ef6ff37e974b73c25eecc13952c55bceed9112be2d9d938ded8e856138bcc"}, - {file = "cffi-1.16.0-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a09582f178759ee8128d9270cd1344154fd473bb77d94ce0aeb2a93ebf0feaf0"}, - {file = "cffi-1.16.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e760191dd42581e023a68b758769e2da259b5d52e3103c6060ddc02c9edb8d7b"}, - {file = "cffi-1.16.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:80876338e19c951fdfed6198e70bc88f1c9758b94578d5a7c4c91a87af3cf31c"}, - {file = "cffi-1.16.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a6a14b17d7e17fa0d207ac08642c8820f84f25ce17a442fd15e27ea18d67c59b"}, - {file = "cffi-1.16.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6602bc8dc6f3a9e02b6c22c4fc1e47aa50f8f8e6d3f78a5e16ac33ef5fefa324"}, - {file = "cffi-1.16.0-cp38-cp38-win32.whl", hash = "sha256:131fd094d1065b19540c3d72594260f118b231090295d8c34e19a7bbcf2e860a"}, - {file = "cffi-1.16.0-cp38-cp38-win_amd64.whl", hash = "sha256:31d13b0f99e0836b7ff893d37af07366ebc90b678b6664c955b54561fc36ef36"}, - {file = "cffi-1.16.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:582215a0e9adbe0e379761260553ba11c58943e4bbe9c36430c4ca6ac74b15ed"}, - {file = "cffi-1.16.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b29ebffcf550f9da55bec9e02ad430c992a87e5f512cd63388abb76f1036d8d2"}, - {file = "cffi-1.16.0-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dc9b18bf40cc75f66f40a7379f6a9513244fe33c0e8aa72e2d56b0196a7ef872"}, - {file = "cffi-1.16.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9cb4a35b3642fc5c005a6755a5d17c6c8b6bcb6981baf81cea8bfbc8903e8ba8"}, - {file = "cffi-1.16.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b86851a328eedc692acf81fb05444bdf1891747c25af7529e39ddafaf68a4f3f"}, - {file = "cffi-1.16.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c0f31130ebc2d37cdd8e44605fb5fa7ad59049298b3f745c74fa74c62fbfcfc4"}, - {file = "cffi-1.16.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f8e709127c6c77446a8c0a8c8bf3c8ee706a06cd44b1e827c3e6a2ee6b8c098"}, - {file = "cffi-1.16.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:748dcd1e3d3d7cd5443ef03ce8685043294ad6bd7c02a38d1bd367cfd968e000"}, - {file = "cffi-1.16.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:8895613bcc094d4a1b2dbe179d88d7fb4a15cee43c052e8885783fac397d91fe"}, - {file = "cffi-1.16.0-cp39-cp39-win32.whl", hash = "sha256:ed86a35631f7bfbb28e108dd96773b9d5a6ce4811cf6ea468bb6a359b256b1e4"}, - {file = "cffi-1.16.0-cp39-cp39-win_amd64.whl", hash = "sha256:3686dffb02459559c74dd3d81748269ffb0eb027c39a6fc99502de37d501faa8"}, - {file = "cffi-1.16.0.tar.gz", hash = "sha256:bcb3ef43e58665bbda2fb198698fcae6776483e0c4a631aa5647806c25e02cc0"}, -] - -[package.dependencies] -pycparser = "*" - [[package]] name = "cfgv" version = "3.4.0" @@ -946,20 +882,6 @@ files = [ {file = "numpy-1.26.4.tar.gz", hash = "sha256:2a02aba9ed12e4ac4eb3ea9421c420301a0c6460d9830d74a9df87efa4912010"}, ] -[[package]] -name = "outcome" -version = "1.3.0.post0" -description = "Capture the outcome of Python function calls." -optional = false -python-versions = ">=3.7" -files = [ - {file = "outcome-1.3.0.post0-py2.py3-none-any.whl", hash = "sha256:e771c5ce06d1415e356078d3bdd68523f284b4ce5419828922b6871e65eda82b"}, - {file = "outcome-1.3.0.post0.tar.gz", hash = "sha256:9dcf02e65f2971b80047b377468e72a268e15c0af3cf1238e6ff14f7f91143b8"}, -] - -[package.dependencies] -attrs = ">=19.2.0" - [[package]] name = "packaging" version = "23.2" @@ -1257,17 +1179,6 @@ files = [ {file = "pycodestyle-2.11.1.tar.gz", hash = "sha256:41ba0e7afc9752dfb53ced5489e89f8186be00e599e712660695b7a75ff2663f"}, ] -[[package]] -name = "pycparser" -version = "2.22" -description = "C parser in Python" -optional = false -python-versions = ">=3.8" -files = [ - {file = "pycparser-2.22-py3-none-any.whl", hash = "sha256:c3702b6d3dd8c7abc1afa565d7e63d53a1d0bd86cdc24edd75470f4de499cfcc"}, - {file = "pycparser-2.22.tar.gz", hash = "sha256:491c8be9c040f5390f5bf44a5b07752bd07f56edf992381b05c701439eec10f6"}, -] - [[package]] name = "pydeck" version = "0.9.1" @@ -1712,17 +1623,6 @@ files = [ {file = "sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc"}, ] -[[package]] -name = "sortedcontainers" -version = "2.4.0" -description = "Sorted Containers -- Sorted List, Sorted Dict, Sorted Set" -optional = false -python-versions = "*" -files = [ - {file = "sortedcontainers-2.4.0-py2.py3-none-any.whl", hash = "sha256:a163dcaede0f1c021485e957a39245190e74249897e2ae4b2aa38595db237ee0"}, - {file = "sortedcontainers-2.4.0.tar.gz", hash = "sha256:25caa5a06cc30b6b83d11423433f65d1f9d76c4c6a0c90e3379eaa43b9bfdb88"}, -] - [[package]] name = "streamlit" version = "1.27.0" @@ -1830,26 +1730,6 @@ files = [ {file = "tornado-6.4.1.tar.gz", hash = "sha256:92d3ab53183d8c50f8204a51e6f91d18a15d5ef261e84d452800d4ff6fc504e9"}, ] -[[package]] -name = "trio" -version = "0.25.1" -description = "A friendly Python library for async concurrency and I/O" -optional = false -python-versions = ">=3.8" -files = [ - {file = "trio-0.25.1-py3-none-any.whl", hash = "sha256:e42617ba091e7b2e50c899052e83a3c403101841de925187f61e7b7eaebdf3fb"}, - {file = "trio-0.25.1.tar.gz", hash = "sha256:9f5314f014ea3af489e77b001861c535005c3858d38ec46b6b071ebfa339d7fb"}, -] - -[package.dependencies] -attrs = ">=23.2.0" -cffi = {version = ">=1.14", markers = "os_name == \"nt\" and implementation_name != \"pypy\""} -exceptiongroup = {version = "*", markers = "python_version < \"3.11\""} -idna = "*" -outcome = "*" -sniffio = ">=1.3.0" -sortedcontainers = "*" - [[package]] name = "typing-extensions" version = "4.12.2" @@ -1999,4 +1879,4 @@ test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", [metadata] lock-version = "2.0" python-versions = ">=3.9,<3.9.7 || >3.9.7,<4.0" -content-hash = "b0a265c676a14faabce90d2227ef14170b89a647f70e7c7c0f381319b0d9840f" +content-hash = "42b006d1fdee1ed5cf06d63c01ef2f8b4fa94839a0ebf52a9e16d3e85c4ed202" diff --git a/pyproject.toml b/pyproject.toml index 607125e..05bf063 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [tool.poetry] name = "waybacktweets" version = "1.0" -description = "" +description = "Retrieves archived tweets' CDX data from the Wayback Machine" authors = ["Claromes "] license = "GPL-3.0" readme = "README.md" @@ -12,7 +12,7 @@ requests = "^2.30.0" streamlit = "1.27.0" rich = "^13.6.0" httpx = "^0.27.0" -trio = "^0.25.1" +click = "^8.1.7" [tool.poetry.group.docs.dependencies] mkdocs = "^1.6.0" @@ -32,6 +32,9 @@ profile = "black" max-line-length = 88 extend-ignore = ["E203", "E701"] +[tool.poetry.scripts] +wbt = 'waybacktweets.cli:cli' + [build-system] requires = ["poetry-core"] build-backend = "poetry.core.masonry.api" diff --git a/waybacktweets/cli.py b/waybacktweets/cli.py new file mode 100644 index 0000000..ebaebc8 --- /dev/null +++ b/waybacktweets/cli.py @@ -0,0 +1,91 @@ +""" +CLI functions for retrieving archived tweets. +""" + +from datetime import datetime + +import click +from rich import print as rprint + +from waybacktweets.export_tweets import TweetsExporter +from waybacktweets.parse_tweets import TweetsParser +from waybacktweets.request_tweets import WaybackTweets + + +def parse_date(ctx, param, value): + if value is None: + return None + + date = datetime.strptime(value, "%Y%m%d") + return date.strftime("%Y%m%d") + + +@click.command() +@click.argument("username", type=str) +@click.option( + "--unique", + type=bool, + default=False, + help="Only show unique URLs. Filtering by the collapse option using the urlkey field.", # noqa: E501 +) +@click.option( + "--from", + "timestamp_from", + type=click.UNPROCESSED, + callback=parse_date, + default=None, + help="Filtering by date range from this date.", +) +@click.option( + "--to", + "timestamp_to", + type=click.UNPROCESSED, + callback=parse_date, + default=None, + help="Filtering by date range up to this date.", +) +@click.option("--limit", type=int, default=None, help="Query result limits.") +def cli(username, unique, timestamp_from, timestamp_to, limit): + """ + Retrieves archived tweets' CDX data from the Wayback Machine, + performs necessary parsing, and saves the data. + + USERNAME: The Twitter username without @. + """ + try: + api = WaybackTweets(username, unique, timestamp_from, timestamp_to, limit) + archived_tweets = api.get() + + if archived_tweets: + field_options = [ + "archived_urlkey", + "archived_timestamp", + "original_tweet_url", + "archived_tweet_url", + "parsed_tweet_url", + "parsed_archived_tweet_url", + "parsed_tweet_text_mimetype_json", + "available_tweet_text", + "available_tweet_is_RT", + "available_tweet_info", + "archived_mimetype", + "archived_statuscode", + "archived_digest", + "archived_length", + ] + + parser = TweetsParser(archived_tweets, username, field_options) + parsed_tweets = parser.parse() + + exporter = TweetsExporter(parsed_tweets, username, field_options) + + exporter.save_to_csv() + exporter.save_to_json() + exporter.save_to_html() + + except TypeError as e: + rprint(f"[red]{e}") + finally: + rprint( + "[yellow]\nNeed help? Open an issue: https://github.com/claromes/waybacktweets/issues" # noqa: E501 + ) diff --git a/waybacktweets/export_tweets.py b/waybacktweets/export_tweets.py index b43410d..a62204f 100644 --- a/waybacktweets/export_tweets.py +++ b/waybacktweets/export_tweets.py @@ -3,18 +3,17 @@ import os import re import pandas as pd -from rich import print as rprint -from viz_tweets import HTMLTweetsVisualizer + +from waybacktweets.viz_tweets import HTMLTweetsVisualizer class TweetsExporter: """Handles the exporting of parsed archived tweets.""" - def __init__(self, data, username, metadata_options, ascending): + def __init__(self, data, username, field_options): self.data = data self.username = username - self.metadata_options = metadata_options - self.ascending = ascending + self.field_options = field_options self.formatted_datetime = self.datetime_now() self.filename = f"{self.username}_tweets_{self.formatted_datetime}" self.dataframe = self.create_dataframe(self) @@ -50,8 +49,7 @@ class TweetsExporter: """Creates a DataFrame from the transposed data.""" data_transposed = self.transpose_matrix(self.data) - df = pd.DataFrame(data_transposed, columns=self.metadata_options) - df = df.sort_values(by="archived_timestamp", ascending=self.ascending) + df = pd.DataFrame(data_transposed, columns=self.field_options) return df @@ -60,14 +58,14 @@ class TweetsExporter: csv_file_path = f"{self.filename}.csv" self.dataframe.to_csv(csv_file_path, index=False) - rprint(f"[blue]Saved to {csv_file_path}") + print(f"Saved to {csv_file_path}") def save_to_json(self): """Saves the DataFrame to a JSON file.""" json_file_path = f"{self.filename}.json" self.dataframe.to_json(json_file_path, orient="records", lines=False) - rprint(f"[blue]Saved to {json_file_path}") + print(f"Saved to {json_file_path}") def save_to_html(self): """Saves the DataFrame to an HTML file.""" @@ -83,4 +81,4 @@ class TweetsExporter: html_content = html.generate() html.save(html_content) - rprint(f"[blue]Saved to {html_file_path}") + print(f"Saved to {html_file_path}") diff --git a/waybacktweets/main.py b/waybacktweets/main.py deleted file mode 100644 index d7d5b8a..0000000 --- a/waybacktweets/main.py +++ /dev/null @@ -1,64 +0,0 @@ -""" -Main function for retrieving archived tweets. -""" - -import trio -from export_tweets import TweetsExporter -from parse_tweets import TweetsParser -from request_tweets import WaybackTweets -from rich import print as rprint - -username = "claromes" -unique = True -datetime_from = None -datetime_to = None -ascending = False - - -async def main(): - """ - Invokes the classes to retrieve archived tweets, perform necessary parsing, - and save the data. - """ - try: - api = WaybackTweets(username, unique, datetime_from, datetime_to) - archived_tweets = await api.get() - - if archived_tweets: - metadata_options = [ - "archived_urlkey", - "archived_timestamp", - "original_tweet_url", - "archived_tweet_url", - "parsed_tweet_url", - "parsed_archived_tweet_url", - "parsed_tweet_text_mimetype_json", - "available_tweet_text", - "available_tweet_is_RT", - "available_tweet_username", - "archived_mimetype", - "archived_statuscode", - "archived_digest", - "archived_length", - ] - - parser = TweetsParser(archived_tweets, username, metadata_options) - parsed_tweets = parser.parse() - - exporter = TweetsExporter( - parsed_tweets, username, metadata_options, ascending - ) - exporter.save_to_csv() - exporter.save_to_json() - exporter.save_to_html() - - except TypeError as e: - print(e) - finally: - rprint( - "[yellow]\nNeed help? Open an issue: https://github.com/claromes/waybacktweets/issues" # noqa: E501 - ) - - -if __name__ == "__main__": - trio.run(main) diff --git a/waybacktweets/parse_tweets.py b/waybacktweets/parse_tweets.py index 57985f1..6e4c2a3 100644 --- a/waybacktweets/parse_tweets.py +++ b/waybacktweets/parse_tweets.py @@ -5,7 +5,8 @@ from urllib.parse import unquote import httpx from rich import print as rprint from rich.progress import Progress -from utils import ( + +from waybacktweets.utils import ( check_double_status, check_pattern_tweet, clean_tweet_url, @@ -60,7 +61,7 @@ class TwitterEmbed: return tweet_content, is_RT, user_info except Exception: - rprint("[yellow]Error parsing the tweet, but the metadata was saved.") + rprint("[yellow]Error parsing the tweet, but the CDX data was saved.") return None @@ -89,7 +90,7 @@ class JsonParser: return json_data.get("text", json_data) except Exception: rprint( - f"[yellow]Connection error with {self.archived_tweet_url}. Error parsing the JSON, but the metadata was saved." # noqa: E501 + f"[yellow]Connection error with {self.archived_tweet_url}. Error parsing the JSON, but the CDX data was saved." # noqa: E501 ) return "" @@ -98,13 +99,13 @@ class JsonParser: class TweetsParser: """Handles the overall parsing of archived tweets.""" - def __init__(self, archived_tweets_response, username, metadata_options): + def __init__(self, archived_tweets_response, username, field_options): self.archived_tweets_response = archived_tweets_response self.username = username - self.metadata_options = metadata_options - self.parsed_tweets = {option: [] for option in self.metadata_options} + self.field_options = field_options + self.parsed_tweets = {option: [] for option in self.field_options} - def add_metadata(self, key, value): + def add_field(self, key, value): """ Appends a value to a list in the parsed data structure. Defines which data will be structured and saved. @@ -113,7 +114,7 @@ class TweetsParser: self.parsed_tweets[key].append(value) def process_response(self, response): - """Process the archived tweet's response and add the relevant metadata.""" + """Process the archived tweet's response and add the relevant CDX data.""" tweet_remove_char = unquote(response[2]).replace("’", "") cleaned_tweet = check_pattern_tweet(tweet_remove_char).strip('"') @@ -145,11 +146,9 @@ class TweetsParser: content = embed_parser.embed() if content: - self.add_metadata("available_tweet_text", semicolon_parser(content[0][0])) - self.add_metadata("available_tweet_is_RT", content[1][0]) - self.add_metadata( - "available_tweet_username", semicolon_parser(content[2][0]) - ) + self.add_field("available_tweet_text", semicolon_parser(content[0][0])) + self.add_field("available_tweet_is_RT", content[1][0]) + self.add_field("available_tweet_info", semicolon_parser(content[2][0])) parsed_text_json = "" @@ -159,20 +158,20 @@ class TweetsParser: text_json = json_parser.parse() parsed_text_json = semicolon_parser(text_json) - self.add_metadata("parsed_tweet_text_mimetype_json", parsed_text_json) - self.add_metadata("archived_urlkey", response[0]) - self.add_metadata("archived_timestamp", response[1]) - self.add_metadata("original_tweet_url", encoded_tweet) - self.add_metadata("archived_tweet_url", encoded_archived_tweet) - self.add_metadata("parsed_tweet_url", encoded_parsed_tweet) - self.add_metadata("parsed_archived_tweet_url", encoded_parsed_archived_tweet) - self.add_metadata("archived_mimetype", response[3]) - self.add_metadata("archived_statuscode", response[4]) - self.add_metadata("archived_digest", response[5]) - self.add_metadata("archived_length", response[6]) + self.add_field("parsed_tweet_text_mimetype_json", parsed_text_json) + self.add_field("archived_urlkey", response[0]) + self.add_field("archived_timestamp", response[1]) + self.add_field("original_tweet_url", encoded_tweet) + self.add_field("archived_tweet_url", encoded_archived_tweet) + self.add_field("parsed_tweet_url", encoded_parsed_tweet) + self.add_field("parsed_archived_tweet_url", encoded_parsed_archived_tweet) + self.add_field("archived_mimetype", response[3]) + self.add_field("archived_statuscode", response[4]) + self.add_field("archived_digest", response[5]) + self.add_field("archived_length", response[6]) def parse(self): - """Parses the archived tweets metadata and structures it.""" + """Parses the archived tweets CDX data and structures it.""" with ThreadPoolExecutor(max_workers=10) as executor: futures = { diff --git a/waybacktweets/request_tweets.py b/waybacktweets/request_tweets.py index 97c0c13..78ba5a2 100644 --- a/waybacktweets/request_tweets.py +++ b/waybacktweets/request_tweets.py @@ -5,19 +5,19 @@ from rich import print as rprint class WaybackTweets: """Requests data from the Wayback CDX Server API and returns it in JSON format.""" - def __init__(self, username, unique=False, timestamp_from=None, timestamp_to=None): + def __init__(self, username, unique, timestamp_from, timestamp_to, limit): self.username = username self.unique = unique self.timestamp_from = timestamp_from self.timestamp_to = timestamp_to + self.limit = limit - async def get(self): + def get(self): """GET request to the Internet Archive's CDX API to retrieve archived tweets.""" url = "https://web.archive.org/cdx/search/cdx" params = { "url": f"https://twitter.com/{self.username}/status/*", "output": "json", - "limit": 1000, } if self.unique: @@ -29,11 +29,13 @@ class WaybackTweets: if self.timestamp_to: params["to"] = self.timestamp_to - print("Hi, archivist...") + if self.limit: + params["limit"] = self.limit + + print("Making a request to the Internet Archive...") try: - async with httpx.AsyncClient() as client: - response = await client.get(url, params=params) + response = httpx.get(url, params=params) if not (400 <= response.status_code <= 511): return response.json() @@ -46,4 +48,4 @@ class WaybackTweets: "[red]Temporarily Offline: Internet Archive services are temporarily offline. Please check Internet Archive Twitter feed (https://twitter.com/internetarchive) for the latest information." # noqa: E501 ) except UnboundLocalError as e: - print(e) + rprint(f"[red]{e}") diff --git a/waybacktweets/viz_tweets.py b/waybacktweets/viz_tweets.py index f394e9b..1e803fa 100644 --- a/waybacktweets/viz_tweets.py +++ b/waybacktweets/viz_tweets.py @@ -52,7 +52,7 @@ class HTMLTweetsVisualizer: html += "
\n" html += f'

Available Tweet Content: {tweet["available_tweet_text"]}

\n' html += f'

Available Tweet Is Retweet: {tweet["available_tweet_is_RT"]}

\n' - html += f'

Available Tweet Username: {tweet["available_tweet_username"]}

\n' + html += f'

Available Tweet Username: {tweet["available_tweet_info"]}

\n' if tweet["archived_mimetype"] == "application/json": html += f'

Parsed Tweet Text (application/json): {tweet["parsed_tweet_text_mimetype_json"]}

\n' -- 2.34.1