# Wayback Tweets
-Commands and Python functions for retrieving archived tweets
+Retrieves archived tweets' CDX data from the Wayback Machine, performs necessary parsing, and saves the data.
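+
+A minimal Python sketch of the flow behind the CLI (the class and module names are the ones introduced in this changeset; `get()` returns the parsed JSON response, or `None` if the request fails):
+
+```python
+from waybacktweets.request_tweets import WaybackTweets
+
+api = WaybackTweets(
+    "claromes", unique=True, timestamp_from=None, timestamp_to=None, limit=100
+)
+archived_tweets = api.get()  # CDX rows for the user's archived tweets
+```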
max-height: none;
}
+.magnifyingglass {
+ scale: 1.2;
+}
+
.md-source__icon svg {
scale: 1.5;
}
repo_name: claromes/waybacktweets
edit_uri: tree/main/docs
site_author: Claromes
+site_description: Retrieves archived tweets' CDX data from the Wayback Machine
copyright: Copyright © 2023 - 2024 Claromes · Icons by TheDoodleLibrary
theme:
{file = "certifi-2024.6.2.tar.gz", hash = "sha256:3cd43f1c6fa7dedc5899d69d3ad0398fd018ad1a17fba83ddaf78aa46c747516"},
]
-[[package]]
-name = "cffi"
-version = "1.16.0"
-description = "Foreign Function Interface for Python calling C code."
-optional = false
-python-versions = ">=3.8"
-files = [
- {file = "cffi-1.16.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:6b3d6606d369fc1da4fd8c357d026317fbb9c9b75d36dc16e90e84c26854b088"},
- {file = "cffi-1.16.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ac0f5edd2360eea2f1daa9e26a41db02dd4b0451b48f7c318e217ee092a213e9"},
- {file = "cffi-1.16.0-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7e61e3e4fa664a8588aa25c883eab612a188c725755afff6289454d6362b9673"},
- {file = "cffi-1.16.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a72e8961a86d19bdb45851d8f1f08b041ea37d2bd8d4fd19903bc3083d80c896"},
- {file = "cffi-1.16.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5b50bf3f55561dac5438f8e70bfcdfd74543fd60df5fa5f62d94e5867deca684"},
- {file = "cffi-1.16.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7651c50c8c5ef7bdb41108b7b8c5a83013bfaa8a935590c5d74627c047a583c7"},
- {file = "cffi-1.16.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e4108df7fe9b707191e55f33efbcb2d81928e10cea45527879a4749cbe472614"},
- {file = "cffi-1.16.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:32c68ef735dbe5857c810328cb2481e24722a59a2003018885514d4c09af9743"},
- {file = "cffi-1.16.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:673739cb539f8cdaa07d92d02efa93c9ccf87e345b9a0b556e3ecc666718468d"},
- {file = "cffi-1.16.0-cp310-cp310-win32.whl", hash = "sha256:9f90389693731ff1f659e55c7d1640e2ec43ff725cc61b04b2f9c6d8d017df6a"},
- {file = "cffi-1.16.0-cp310-cp310-win_amd64.whl", hash = "sha256:e6024675e67af929088fda399b2094574609396b1decb609c55fa58b028a32a1"},
- {file = "cffi-1.16.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b84834d0cf97e7d27dd5b7f3aca7b6e9263c56308ab9dc8aae9784abb774d404"},
- {file = "cffi-1.16.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1b8ebc27c014c59692bb2664c7d13ce7a6e9a629be20e54e7271fa696ff2b417"},
- {file = "cffi-1.16.0-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ee07e47c12890ef248766a6e55bd38ebfb2bb8edd4142d56db91b21ea68b7627"},
- {file = "cffi-1.16.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d8a9d3ebe49f084ad71f9269834ceccbf398253c9fac910c4fd7053ff1386936"},
- {file = "cffi-1.16.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e70f54f1796669ef691ca07d046cd81a29cb4deb1e5f942003f401c0c4a2695d"},
- {file = "cffi-1.16.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5bf44d66cdf9e893637896c7faa22298baebcd18d1ddb6d2626a6e39793a1d56"},
- {file = "cffi-1.16.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7b78010e7b97fef4bee1e896df8a4bbb6712b7f05b7ef630f9d1da00f6444d2e"},
- {file = "cffi-1.16.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:c6a164aa47843fb1b01e941d385aab7215563bb8816d80ff3a363a9f8448a8dc"},
- {file = "cffi-1.16.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e09f3ff613345df5e8c3667da1d918f9149bd623cd9070c983c013792a9a62eb"},
- {file = "cffi-1.16.0-cp311-cp311-win32.whl", hash = "sha256:2c56b361916f390cd758a57f2e16233eb4f64bcbeee88a4881ea90fca14dc6ab"},
- {file = "cffi-1.16.0-cp311-cp311-win_amd64.whl", hash = "sha256:db8e577c19c0fda0beb7e0d4e09e0ba74b1e4c092e0e40bfa12fe05b6f6d75ba"},
- {file = "cffi-1.16.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:fa3a0128b152627161ce47201262d3140edb5a5c3da88d73a1b790a959126956"},
- {file = "cffi-1.16.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:68e7c44931cc171c54ccb702482e9fc723192e88d25a0e133edd7aff8fcd1f6e"},
- {file = "cffi-1.16.0-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:abd808f9c129ba2beda4cfc53bde801e5bcf9d6e0f22f095e45327c038bfe68e"},
- {file = "cffi-1.16.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:88e2b3c14bdb32e440be531ade29d3c50a1a59cd4e51b1dd8b0865c54ea5d2e2"},
- {file = "cffi-1.16.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:fcc8eb6d5902bb1cf6dc4f187ee3ea80a1eba0a89aba40a5cb20a5087d961357"},
- {file = "cffi-1.16.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b7be2d771cdba2942e13215c4e340bfd76398e9227ad10402a8767ab1865d2e6"},
- {file = "cffi-1.16.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e715596e683d2ce000574bae5d07bd522c781a822866c20495e52520564f0969"},
- {file = "cffi-1.16.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:2d92b25dbf6cae33f65005baf472d2c245c050b1ce709cc4588cdcdd5495b520"},
- {file = "cffi-1.16.0-cp312-cp312-win32.whl", hash = "sha256:b2ca4e77f9f47c55c194982e10f058db063937845bb2b7a86c84a6cfe0aefa8b"},
- {file = "cffi-1.16.0-cp312-cp312-win_amd64.whl", hash = "sha256:68678abf380b42ce21a5f2abde8efee05c114c2fdb2e9eef2efdb0257fba1235"},
- {file = "cffi-1.16.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:0c9ef6ff37e974b73c25eecc13952c55bceed9112be2d9d938ded8e856138bcc"},
- {file = "cffi-1.16.0-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a09582f178759ee8128d9270cd1344154fd473bb77d94ce0aeb2a93ebf0feaf0"},
- {file = "cffi-1.16.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e760191dd42581e023a68b758769e2da259b5d52e3103c6060ddc02c9edb8d7b"},
- {file = "cffi-1.16.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:80876338e19c951fdfed6198e70bc88f1c9758b94578d5a7c4c91a87af3cf31c"},
- {file = "cffi-1.16.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a6a14b17d7e17fa0d207ac08642c8820f84f25ce17a442fd15e27ea18d67c59b"},
- {file = "cffi-1.16.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6602bc8dc6f3a9e02b6c22c4fc1e47aa50f8f8e6d3f78a5e16ac33ef5fefa324"},
- {file = "cffi-1.16.0-cp38-cp38-win32.whl", hash = "sha256:131fd094d1065b19540c3d72594260f118b231090295d8c34e19a7bbcf2e860a"},
- {file = "cffi-1.16.0-cp38-cp38-win_amd64.whl", hash = "sha256:31d13b0f99e0836b7ff893d37af07366ebc90b678b6664c955b54561fc36ef36"},
- {file = "cffi-1.16.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:582215a0e9adbe0e379761260553ba11c58943e4bbe9c36430c4ca6ac74b15ed"},
- {file = "cffi-1.16.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b29ebffcf550f9da55bec9e02ad430c992a87e5f512cd63388abb76f1036d8d2"},
- {file = "cffi-1.16.0-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dc9b18bf40cc75f66f40a7379f6a9513244fe33c0e8aa72e2d56b0196a7ef872"},
- {file = "cffi-1.16.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9cb4a35b3642fc5c005a6755a5d17c6c8b6bcb6981baf81cea8bfbc8903e8ba8"},
- {file = "cffi-1.16.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b86851a328eedc692acf81fb05444bdf1891747c25af7529e39ddafaf68a4f3f"},
- {file = "cffi-1.16.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c0f31130ebc2d37cdd8e44605fb5fa7ad59049298b3f745c74fa74c62fbfcfc4"},
- {file = "cffi-1.16.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f8e709127c6c77446a8c0a8c8bf3c8ee706a06cd44b1e827c3e6a2ee6b8c098"},
- {file = "cffi-1.16.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:748dcd1e3d3d7cd5443ef03ce8685043294ad6bd7c02a38d1bd367cfd968e000"},
- {file = "cffi-1.16.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:8895613bcc094d4a1b2dbe179d88d7fb4a15cee43c052e8885783fac397d91fe"},
- {file = "cffi-1.16.0-cp39-cp39-win32.whl", hash = "sha256:ed86a35631f7bfbb28e108dd96773b9d5a6ce4811cf6ea468bb6a359b256b1e4"},
- {file = "cffi-1.16.0-cp39-cp39-win_amd64.whl", hash = "sha256:3686dffb02459559c74dd3d81748269ffb0eb027c39a6fc99502de37d501faa8"},
- {file = "cffi-1.16.0.tar.gz", hash = "sha256:bcb3ef43e58665bbda2fb198698fcae6776483e0c4a631aa5647806c25e02cc0"},
-]
-
-[package.dependencies]
-pycparser = "*"
-
[[package]]
name = "cfgv"
version = "3.4.0"
{file = "numpy-1.26.4.tar.gz", hash = "sha256:2a02aba9ed12e4ac4eb3ea9421c420301a0c6460d9830d74a9df87efa4912010"},
]
-[[package]]
-name = "outcome"
-version = "1.3.0.post0"
-description = "Capture the outcome of Python function calls."
-optional = false
-python-versions = ">=3.7"
-files = [
- {file = "outcome-1.3.0.post0-py2.py3-none-any.whl", hash = "sha256:e771c5ce06d1415e356078d3bdd68523f284b4ce5419828922b6871e65eda82b"},
- {file = "outcome-1.3.0.post0.tar.gz", hash = "sha256:9dcf02e65f2971b80047b377468e72a268e15c0af3cf1238e6ff14f7f91143b8"},
-]
-
-[package.dependencies]
-attrs = ">=19.2.0"
-
[[package]]
name = "packaging"
version = "23.2"
{file = "pycodestyle-2.11.1.tar.gz", hash = "sha256:41ba0e7afc9752dfb53ced5489e89f8186be00e599e712660695b7a75ff2663f"},
]
-[[package]]
-name = "pycparser"
-version = "2.22"
-description = "C parser in Python"
-optional = false
-python-versions = ">=3.8"
-files = [
- {file = "pycparser-2.22-py3-none-any.whl", hash = "sha256:c3702b6d3dd8c7abc1afa565d7e63d53a1d0bd86cdc24edd75470f4de499cfcc"},
- {file = "pycparser-2.22.tar.gz", hash = "sha256:491c8be9c040f5390f5bf44a5b07752bd07f56edf992381b05c701439eec10f6"},
-]
-
[[package]]
name = "pydeck"
version = "0.9.1"
{file = "sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc"},
]
-[[package]]
-name = "sortedcontainers"
-version = "2.4.0"
-description = "Sorted Containers -- Sorted List, Sorted Dict, Sorted Set"
-optional = false
-python-versions = "*"
-files = [
- {file = "sortedcontainers-2.4.0-py2.py3-none-any.whl", hash = "sha256:a163dcaede0f1c021485e957a39245190e74249897e2ae4b2aa38595db237ee0"},
- {file = "sortedcontainers-2.4.0.tar.gz", hash = "sha256:25caa5a06cc30b6b83d11423433f65d1f9d76c4c6a0c90e3379eaa43b9bfdb88"},
-]
-
[[package]]
name = "streamlit"
version = "1.27.0"
{file = "tornado-6.4.1.tar.gz", hash = "sha256:92d3ab53183d8c50f8204a51e6f91d18a15d5ef261e84d452800d4ff6fc504e9"},
]
-[[package]]
-name = "trio"
-version = "0.25.1"
-description = "A friendly Python library for async concurrency and I/O"
-optional = false
-python-versions = ">=3.8"
-files = [
- {file = "trio-0.25.1-py3-none-any.whl", hash = "sha256:e42617ba091e7b2e50c899052e83a3c403101841de925187f61e7b7eaebdf3fb"},
- {file = "trio-0.25.1.tar.gz", hash = "sha256:9f5314f014ea3af489e77b001861c535005c3858d38ec46b6b071ebfa339d7fb"},
-]
-
-[package.dependencies]
-attrs = ">=23.2.0"
-cffi = {version = ">=1.14", markers = "os_name == \"nt\" and implementation_name != \"pypy\""}
-exceptiongroup = {version = "*", markers = "python_version < \"3.11\""}
-idna = "*"
-outcome = "*"
-sniffio = ">=1.3.0"
-sortedcontainers = "*"
-
[[package]]
name = "typing-extensions"
version = "4.12.2"
[metadata]
lock-version = "2.0"
python-versions = ">=3.9,<3.9.7 || >3.9.7,<4.0"
-content-hash = "b0a265c676a14faabce90d2227ef14170b89a647f70e7c7c0f381319b0d9840f"
+content-hash = "42b006d1fdee1ed5cf06d63c01ef2f8b4fa94839a0ebf52a9e16d3e85c4ed202"
[tool.poetry]
name = "waybacktweets"
version = "1.0"
-description = ""
+description = "Retrieves archived tweets' CDX data from the Wayback Machine"
authors = ["Claromes <support@claromes.com>"]
license = "GPL-3.0"
readme = "README.md"
streamlit = "1.27.0"
rich = "^13.6.0"
httpx = "^0.27.0"
-trio = "^0.25.1"
+click = "^8.1.7"
[tool.poetry.group.docs.dependencies]
mkdocs = "^1.6.0"
max-line-length = 88
extend-ignore = ["E203", "E701"]
+[tool.poetry.scripts]
+wbt = 'waybacktweets.cli:cli'
+
[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
--- /dev/null
+"""
+CLI functions for retrieving archived tweets.
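+
+Typical invocation, once the ``wbt`` entry point declared in pyproject.toml
+is installed (a sketch)::
+
+    wbt claromes --unique --from 20230101 --to 20231231 --limit 250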
+"""
+
+from datetime import datetime
+
+import click
+from rich import print as rprint
+
+from waybacktweets.export_tweets import TweetsExporter
+from waybacktweets.parse_tweets import TweetsParser
+from waybacktweets.request_tweets import WaybackTweets
+
+
+def parse_date(ctx, param, value):
+    """Validates and normalizes a date option in YYYYMMDD format."""
+    if value is None:
+        return None
+
+    try:
+        date = datetime.strptime(value, "%Y%m%d")
+        return date.strftime("%Y%m%d")
+    except ValueError:
+        raise click.BadParameter("Dates must be in YYYYMMDD format.")
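+
+# Example: parse_date(None, None, "20230601") returns "20230601", while a
+# malformed value such as "2023-06-01" raises click.BadParameter.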
+
+
+@click.command()
+@click.argument("username", type=str)
+@click.option(
+ "--unique",
+ type=bool,
+ default=False,
+ help="Only show unique URLs. Filtering by the collapse option using the urlkey field.", # noqa: E501
+)
+@click.option(
+ "--from",
+ "timestamp_from",
+ type=click.UNPROCESSED,
+ callback=parse_date,
+ default=None,
+ help="Filtering by date range from this date.",
+)
+@click.option(
+ "--to",
+ "timestamp_to",
+ type=click.UNPROCESSED,
+ callback=parse_date,
+ default=None,
+ help="Filtering by date range up to this date.",
+)
+@click.option("--limit", type=int, default=None, help="Query result limits.")
+def cli(username, unique, timestamp_from, timestamp_to, limit):
+ """
+ Retrieves archived tweets' CDX data from the Wayback Machine,
+ performs necessary parsing, and saves the data.
+
+ USERNAME: The Twitter username without @.
+ """
+ try:
+ api = WaybackTweets(username, unique, timestamp_from, timestamp_to, limit)
+ archived_tweets = api.get()
+
+ if archived_tweets:
+ field_options = [
+ "archived_urlkey",
+ "archived_timestamp",
+ "original_tweet_url",
+ "archived_tweet_url",
+ "parsed_tweet_url",
+ "parsed_archived_tweet_url",
+ "parsed_tweet_text_mimetype_json",
+ "available_tweet_text",
+ "available_tweet_is_RT",
+ "available_tweet_info",
+ "archived_mimetype",
+ "archived_statuscode",
+ "archived_digest",
+ "archived_length",
+ ]
+
+ parser = TweetsParser(archived_tweets, username, field_options)
+ parsed_tweets = parser.parse()
+
+ exporter = TweetsExporter(parsed_tweets, username, field_options)
+
+ exporter.save_to_csv()
+ exporter.save_to_json()
+ exporter.save_to_html()
+
+ except TypeError as e:
+ rprint(f"[red]{e}")
+ finally:
+ rprint(
+ "[yellow]\nNeed help? Open an issue: https://github.com/claromes/waybacktweets/issues" # noqa: E501
+ )
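
A quick way to exercise the new command without installing the console script is Click's built-in test runner; a minimal sketch:

```python
from click.testing import CliRunner

from waybacktweets.cli import cli

runner = CliRunner()
result = runner.invoke(cli, ["claromes", "--limit", "10"])
print(result.exit_code)  # 0 when the command ran without raising
print(result.output)
```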
import re
import pandas as pd
-from rich import print as rprint
-from viz_tweets import HTMLTweetsVisualizer
+
+from waybacktweets.viz_tweets import HTMLTweetsVisualizer
class TweetsExporter:
"""Handles the exporting of parsed archived tweets."""
- def __init__(self, data, username, metadata_options, ascending):
+ def __init__(self, data, username, field_options):
self.data = data
self.username = username
- self.metadata_options = metadata_options
- self.ascending = ascending
+ self.field_options = field_options
self.formatted_datetime = self.datetime_now()
self.filename = f"{self.username}_tweets_{self.formatted_datetime}"
        self.dataframe = self.create_dataframe()
"""Creates a DataFrame from the transposed data."""
data_transposed = self.transpose_matrix(self.data)
- df = pd.DataFrame(data_transposed, columns=self.metadata_options)
- df = df.sort_values(by="archived_timestamp", ascending=self.ascending)
+ df = pd.DataFrame(data_transposed, columns=self.field_options)
return df
csv_file_path = f"{self.filename}.csv"
self.dataframe.to_csv(csv_file_path, index=False)
- rprint(f"[blue]Saved to {csv_file_path}")
+ print(f"Saved to {csv_file_path}")
def save_to_json(self):
"""Saves the DataFrame to a JSON file."""
json_file_path = f"{self.filename}.json"
self.dataframe.to_json(json_file_path, orient="records", lines=False)
- rprint(f"[blue]Saved to {json_file_path}")
+ print(f"Saved to {json_file_path}")
def save_to_html(self):
"""Saves the DataFrame to an HTML file."""
html_content = html.generate()
html.save(html_content)
- rprint(f"[blue]Saved to {html_file_path}")
+ print(f"Saved to {html_file_path}")
+++ /dev/null
-"""
-Main function for retrieving archived tweets.
-"""
-
-import trio
-from export_tweets import TweetsExporter
-from parse_tweets import TweetsParser
-from request_tweets import WaybackTweets
-from rich import print as rprint
-
-username = "claromes"
-unique = True
-datetime_from = None
-datetime_to = None
-ascending = False
-
-
-async def main():
- """
- Invokes the classes to retrieve archived tweets, perform necessary parsing,
- and save the data.
- """
- try:
- api = WaybackTweets(username, unique, datetime_from, datetime_to)
- archived_tweets = await api.get()
-
- if archived_tweets:
- metadata_options = [
- "archived_urlkey",
- "archived_timestamp",
- "original_tweet_url",
- "archived_tweet_url",
- "parsed_tweet_url",
- "parsed_archived_tweet_url",
- "parsed_tweet_text_mimetype_json",
- "available_tweet_text",
- "available_tweet_is_RT",
- "available_tweet_username",
- "archived_mimetype",
- "archived_statuscode",
- "archived_digest",
- "archived_length",
- ]
-
- parser = TweetsParser(archived_tweets, username, metadata_options)
- parsed_tweets = parser.parse()
-
- exporter = TweetsExporter(
- parsed_tweets, username, metadata_options, ascending
- )
- exporter.save_to_csv()
- exporter.save_to_json()
- exporter.save_to_html()
-
- except TypeError as e:
- print(e)
- finally:
- rprint(
- "[yellow]\nNeed help? Open an issue: https://github.com/claromes/waybacktweets/issues" # noqa: E501
- )
-
-
-if __name__ == "__main__":
- trio.run(main)
import httpx
from rich import print as rprint
from rich.progress import Progress
-from utils import (
+
+from waybacktweets.utils import (
check_double_status,
check_pattern_tweet,
clean_tweet_url,
return tweet_content, is_RT, user_info
except Exception:
- rprint("[yellow]Error parsing the tweet, but the metadata was saved.")
+ rprint("[yellow]Error parsing the tweet, but the CDX data was saved.")
return None
return json_data.get("text", json_data)
except Exception:
rprint(
- f"[yellow]Connection error with {self.archived_tweet_url}. Error parsing the JSON, but the metadata was saved." # noqa: E501
+ f"[yellow]Connection error with {self.archived_tweet_url}. Error parsing the JSON, but the CDX data was saved." # noqa: E501
)
return ""
class TweetsParser:
"""Handles the overall parsing of archived tweets."""
- def __init__(self, archived_tweets_response, username, metadata_options):
+ def __init__(self, archived_tweets_response, username, field_options):
self.archived_tweets_response = archived_tweets_response
self.username = username
- self.metadata_options = metadata_options
- self.parsed_tweets = {option: [] for option in self.metadata_options}
+ self.field_options = field_options
+ self.parsed_tweets = {option: [] for option in self.field_options}
- def add_metadata(self, key, value):
+ def add_field(self, key, value):
"""
Appends a value to a list in the parsed data structure.
Defines which data will be structured and saved.
self.parsed_tweets[key].append(value)
def process_response(self, response):
- """Process the archived tweet's response and add the relevant metadata."""
+ """Process the archived tweet's response and add the relevant CDX data."""
tweet_remove_char = unquote(response[2]).replace("’", "")
cleaned_tweet = check_pattern_tweet(tweet_remove_char).strip('"')
content = embed_parser.embed()
if content:
- self.add_metadata("available_tweet_text", semicolon_parser(content[0][0]))
- self.add_metadata("available_tweet_is_RT", content[1][0])
- self.add_metadata(
- "available_tweet_username", semicolon_parser(content[2][0])
- )
+ self.add_field("available_tweet_text", semicolon_parser(content[0][0]))
+ self.add_field("available_tweet_is_RT", content[1][0])
+ self.add_field("available_tweet_info", semicolon_parser(content[2][0]))
parsed_text_json = ""
text_json = json_parser.parse()
parsed_text_json = semicolon_parser(text_json)
- self.add_metadata("parsed_tweet_text_mimetype_json", parsed_text_json)
- self.add_metadata("archived_urlkey", response[0])
- self.add_metadata("archived_timestamp", response[1])
- self.add_metadata("original_tweet_url", encoded_tweet)
- self.add_metadata("archived_tweet_url", encoded_archived_tweet)
- self.add_metadata("parsed_tweet_url", encoded_parsed_tweet)
- self.add_metadata("parsed_archived_tweet_url", encoded_parsed_archived_tweet)
- self.add_metadata("archived_mimetype", response[3])
- self.add_metadata("archived_statuscode", response[4])
- self.add_metadata("archived_digest", response[5])
- self.add_metadata("archived_length", response[6])
+ self.add_field("parsed_tweet_text_mimetype_json", parsed_text_json)
+ self.add_field("archived_urlkey", response[0])
+ self.add_field("archived_timestamp", response[1])
+ self.add_field("original_tweet_url", encoded_tweet)
+ self.add_field("archived_tweet_url", encoded_archived_tweet)
+ self.add_field("parsed_tweet_url", encoded_parsed_tweet)
+ self.add_field("parsed_archived_tweet_url", encoded_parsed_archived_tweet)
+ self.add_field("archived_mimetype", response[3])
+ self.add_field("archived_statuscode", response[4])
+ self.add_field("archived_digest", response[5])
+ self.add_field("archived_length", response[6])
def parse(self):
- """Parses the archived tweets metadata and structures it."""
+ """Parses the archived tweets CDX data and structures it."""
with ThreadPoolExecutor(max_workers=10) as executor:
futures = {
class WaybackTweets:
"""Requests data from the Wayback CDX Server API and returns it in JSON format."""
- def __init__(self, username, unique=False, timestamp_from=None, timestamp_to=None):
+ def __init__(self, username, unique, timestamp_from, timestamp_to, limit):
self.username = username
self.unique = unique
self.timestamp_from = timestamp_from
self.timestamp_to = timestamp_to
+ self.limit = limit
- async def get(self):
+ def get(self):
"""GET request to the Internet Archive's CDX API to retrieve archived tweets."""
url = "https://web.archive.org/cdx/search/cdx"
params = {
"url": f"https://twitter.com/{self.username}/status/*",
"output": "json",
- "limit": 1000,
}
if self.unique:
if self.timestamp_to:
params["to"] = self.timestamp_to
- print("Hi, archivist...")
+ if self.limit:
+ params["limit"] = self.limit
+
+ print("Making a request to the Internet Archive...")
try:
- async with httpx.AsyncClient() as client:
- response = await client.get(url, params=params)
+ response = httpx.get(url, params=params)
if not (400 <= response.status_code <= 511):
return response.json()
"[red]Temporarily Offline: Internet Archive services are temporarily offline. Please check Internet Archive Twitter feed (https://twitter.com/internetarchive) for the latest information." # noqa: E501
)
except UnboundLocalError as e:
- print(e)
+ rprint(f"[red]{e}")
html += "<br>\n"
html += f'<p><strong class="content">Available Tweet Content:</strong> {tweet["available_tweet_text"]}</p>\n'
html += f'<p><strong class="content">Available Tweet Is Retweet:</strong> {tweet["available_tweet_is_RT"]}</p>\n'
- html += f'<p><strong class="content">Available Tweet Username:</strong> {tweet["available_tweet_username"]}</p>\n'
+ html += f'<p><strong class="content">Available Tweet Username:</strong> {tweet["available_tweet_info"]}</p>\n'
if tweet["archived_mimetype"] == "application/json":
html += f'<p><strong class="content">Parsed Tweet Text (application/json):</strong> {tweet["parsed_tweet_text_mimetype_json"]}</p>\n'