# Wayback Tweets
-Commands and Python functions for retrieving archived tweets
+Retrieves archived tweets' CDX data from the Wayback Machine, performs necessary parsing, and saves the data.
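+
+A minimal Python sketch of the flow behind the CLI (the class and module names are the ones introduced in this changeset; `get()` returns the parsed JSON response, or `None` if the request fails):
+
+```python
+from waybacktweets.request_tweets import WaybackTweets
+
+api = WaybackTweets(
+    "claromes", unique=True, timestamp_from=None, timestamp_to=None, limit=100
+)
+archived_tweets = api.get()  # CDX rows for the user's archived tweets
+```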
max-height: none;
}
+.magnifyingglass {
+ scale: 1.2;
+}
+
.md-source__icon svg {
scale: 1.5;
}
repo_name: claromes/waybacktweets
edit_uri: tree/main/docs
site_author: Claromes
+site_description: Retrieves archived tweets' CDX data from the Wayback Machine
copyright: Copyright © 2023 - 2024 Claromes · Icons by TheDoodleLibrary
theme:
{file = "certifi-2024.6.2.tar.gz", hash = "sha256:3cd43f1c6fa7dedc5899d69d3ad0398fd018ad1a17fba83ddaf78aa46c747516"},
]
-[[package]]
-name = "cffi"
-version = "1.16.0"
-description = "Foreign Function Interface for Python calling C code."
-optional = false
-python-versions = ">=3.8"
-files = [
- {file = "cffi-1.16.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:6b3d6606d369fc1da4fd8c357d026317fbb9c9b75d36dc16e90e84c26854b088"},
- {file = "cffi-1.16.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ac0f5edd2360eea2f1daa9e26a41db02dd4b0451b48f7c318e217ee092a213e9"},
- {file = "cffi-1.16.0-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7e61e3e4fa664a8588aa25c883eab612a188c725755afff6289454d6362b9673"},
- {file = "cffi-1.16.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a72e8961a86d19bdb45851d8f1f08b041ea37d2bd8d4fd19903bc3083d80c896"},
- {file = "cffi-1.16.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5b50bf3f55561dac5438f8e70bfcdfd74543fd60df5fa5f62d94e5867deca684"},
- {file = "cffi-1.16.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7651c50c8c5ef7bdb41108b7b8c5a83013bfaa8a935590c5d74627c047a583c7"},
- {file = "cffi-1.16.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e4108df7fe9b707191e55f33efbcb2d81928e10cea45527879a4749cbe472614"},
- {file = "cffi-1.16.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:32c68ef735dbe5857c810328cb2481e24722a59a2003018885514d4c09af9743"},
- {file = "cffi-1.16.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:673739cb539f8cdaa07d92d02efa93c9ccf87e345b9a0b556e3ecc666718468d"},
- {file = "cffi-1.16.0-cp310-cp310-win32.whl", hash = "sha256:9f90389693731ff1f659e55c7d1640e2ec43ff725cc61b04b2f9c6d8d017df6a"},
- {file = "cffi-1.16.0-cp310-cp310-win_amd64.whl", hash = "sha256:e6024675e67af929088fda399b2094574609396b1decb609c55fa58b028a32a1"},
- {file = "cffi-1.16.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b84834d0cf97e7d27dd5b7f3aca7b6e9263c56308ab9dc8aae9784abb774d404"},
- {file = "cffi-1.16.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1b8ebc27c014c59692bb2664c7d13ce7a6e9a629be20e54e7271fa696ff2b417"},
- {file = "cffi-1.16.0-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ee07e47c12890ef248766a6e55bd38ebfb2bb8edd4142d56db91b21ea68b7627"},
- {file = "cffi-1.16.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d8a9d3ebe49f084ad71f9269834ceccbf398253c9fac910c4fd7053ff1386936"},
- {file = "cffi-1.16.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e70f54f1796669ef691ca07d046cd81a29cb4deb1e5f942003f401c0c4a2695d"},
- {file = "cffi-1.16.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5bf44d66cdf9e893637896c7faa22298baebcd18d1ddb6d2626a6e39793a1d56"},
- {file = "cffi-1.16.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7b78010e7b97fef4bee1e896df8a4bbb6712b7f05b7ef630f9d1da00f6444d2e"},
- {file = "cffi-1.16.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:c6a164aa47843fb1b01e941d385aab7215563bb8816d80ff3a363a9f8448a8dc"},
- {file = "cffi-1.16.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e09f3ff613345df5e8c3667da1d918f9149bd623cd9070c983c013792a9a62eb"},
- {file = "cffi-1.16.0-cp311-cp311-win32.whl", hash = "sha256:2c56b361916f390cd758a57f2e16233eb4f64bcbeee88a4881ea90fca14dc6ab"},
- {file = "cffi-1.16.0-cp311-cp311-win_amd64.whl", hash = "sha256:db8e577c19c0fda0beb7e0d4e09e0ba74b1e4c092e0e40bfa12fe05b6f6d75ba"},
- {file = "cffi-1.16.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:fa3a0128b152627161ce47201262d3140edb5a5c3da88d73a1b790a959126956"},
- {file = "cffi-1.16.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:68e7c44931cc171c54ccb702482e9fc723192e88d25a0e133edd7aff8fcd1f6e"},
- {file = "cffi-1.16.0-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:abd808f9c129ba2beda4cfc53bde801e5bcf9d6e0f22f095e45327c038bfe68e"},
- {file = "cffi-1.16.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:88e2b3c14bdb32e440be531ade29d3c50a1a59cd4e51b1dd8b0865c54ea5d2e2"},
- {file = "cffi-1.16.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:fcc8eb6d5902bb1cf6dc4f187ee3ea80a1eba0a89aba40a5cb20a5087d961357"},
- {file = "cffi-1.16.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b7be2d771cdba2942e13215c4e340bfd76398e9227ad10402a8767ab1865d2e6"},
- {file = "cffi-1.16.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e715596e683d2ce000574bae5d07bd522c781a822866c20495e52520564f0969"},
- {file = "cffi-1.16.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:2d92b25dbf6cae33f65005baf472d2c245c050b1ce709cc4588cdcdd5495b520"},
- {file = "cffi-1.16.0-cp312-cp312-win32.whl", hash = "sha256:b2ca4e77f9f47c55c194982e10f058db063937845bb2b7a86c84a6cfe0aefa8b"},
- {file = "cffi-1.16.0-cp312-cp312-win_amd64.whl", hash = "sha256:68678abf380b42ce21a5f2abde8efee05c114c2fdb2e9eef2efdb0257fba1235"},
- {file = "cffi-1.16.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:0c9ef6ff37e974b73c25eecc13952c55bceed9112be2d9d938ded8e856138bcc"},
- {file = "cffi-1.16.0-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a09582f178759ee8128d9270cd1344154fd473bb77d94ce0aeb2a93ebf0feaf0"},
- {file = "cffi-1.16.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e760191dd42581e023a68b758769e2da259b5d52e3103c6060ddc02c9edb8d7b"},
- {file = "cffi-1.16.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:80876338e19c951fdfed6198e70bc88f1c9758b94578d5a7c4c91a87af3cf31c"},
- {file = "cffi-1.16.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a6a14b17d7e17fa0d207ac08642c8820f84f25ce17a442fd15e27ea18d67c59b"},
- {file = "cffi-1.16.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6602bc8dc6f3a9e02b6c22c4fc1e47aa50f8f8e6d3f78a5e16ac33ef5fefa324"},
- {file = "cffi-1.16.0-cp38-cp38-win32.whl", hash = "sha256:131fd094d1065b19540c3d72594260f118b231090295d8c34e19a7bbcf2e860a"},
- {file = "cffi-1.16.0-cp38-cp38-win_amd64.whl", hash = "sha256:31d13b0f99e0836b7ff893d37af07366ebc90b678b6664c955b54561fc36ef36"},
- {file = "cffi-1.16.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:582215a0e9adbe0e379761260553ba11c58943e4bbe9c36430c4ca6ac74b15ed"},
- {file = "cffi-1.16.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b29ebffcf550f9da55bec9e02ad430c992a87e5f512cd63388abb76f1036d8d2"},
- {file = "cffi-1.16.0-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dc9b18bf40cc75f66f40a7379f6a9513244fe33c0e8aa72e2d56b0196a7ef872"},
- {file = "cffi-1.16.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9cb4a35b3642fc5c005a6755a5d17c6c8b6bcb6981baf81cea8bfbc8903e8ba8"},
- {file = "cffi-1.16.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b86851a328eedc692acf81fb05444bdf1891747c25af7529e39ddafaf68a4f3f"},
- {file = "cffi-1.16.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c0f31130ebc2d37cdd8e44605fb5fa7ad59049298b3f745c74fa74c62fbfcfc4"},
- {file = "cffi-1.16.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f8e709127c6c77446a8c0a8c8bf3c8ee706a06cd44b1e827c3e6a2ee6b8c098"},
- {file = "cffi-1.16.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:748dcd1e3d3d7cd5443ef03ce8685043294ad6bd7c02a38d1bd367cfd968e000"},
- {file = "cffi-1.16.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:8895613bcc094d4a1b2dbe179d88d7fb4a15cee43c052e8885783fac397d91fe"},
- {file = "cffi-1.16.0-cp39-cp39-win32.whl", hash = "sha256:ed86a35631f7bfbb28e108dd96773b9d5a6ce4811cf6ea468bb6a359b256b1e4"},
- {file = "cffi-1.16.0-cp39-cp39-win_amd64.whl", hash = "sha256:3686dffb02459559c74dd3d81748269ffb0eb027c39a6fc99502de37d501faa8"},
- {file = "cffi-1.16.0.tar.gz", hash = "sha256:bcb3ef43e58665bbda2fb198698fcae6776483e0c4a631aa5647806c25e02cc0"},
-]
-
-[package.dependencies]
-pycparser = "*"
-
[[package]]
name = "cfgv"
version = "3.4.0"
{file = "numpy-1.26.4.tar.gz", hash = "sha256:2a02aba9ed12e4ac4eb3ea9421c420301a0c6460d9830d74a9df87efa4912010"},
]
-[[package]]
-name = "outcome"
-version = "1.3.0.post0"
-description = "Capture the outcome of Python function calls."
-optional = false
-python-versions = ">=3.7"
-files = [
- {file = "outcome-1.3.0.post0-py2.py3-none-any.whl", hash = "sha256:e771c5ce06d1415e356078d3bdd68523f284b4ce5419828922b6871e65eda82b"},
- {file = "outcome-1.3.0.post0.tar.gz", hash = "sha256:9dcf02e65f2971b80047b377468e72a268e15c0af3cf1238e6ff14f7f91143b8"},
-]
-
-[package.dependencies]
-attrs = ">=19.2.0"
-
[[package]]
name = "packaging"
version = "23.2"
{file = "pycodestyle-2.11.1.tar.gz", hash = "sha256:41ba0e7afc9752dfb53ced5489e89f8186be00e599e712660695b7a75ff2663f"},
]
-[[package]]
-name = "pycparser"
-version = "2.22"
-description = "C parser in Python"
-optional = false
-python-versions = ">=3.8"
-files = [
- {file = "pycparser-2.22-py3-none-any.whl", hash = "sha256:c3702b6d3dd8c7abc1afa565d7e63d53a1d0bd86cdc24edd75470f4de499cfcc"},
- {file = "pycparser-2.22.tar.gz", hash = "sha256:491c8be9c040f5390f5bf44a5b07752bd07f56edf992381b05c701439eec10f6"},
-]
-
[[package]]
name = "pydeck"
version = "0.9.1"
{file = "sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc"},
]
-[[package]]
-name = "sortedcontainers"
-version = "2.4.0"
-description = "Sorted Containers -- Sorted List, Sorted Dict, Sorted Set"
-optional = false
-python-versions = "*"
-files = [
- {file = "sortedcontainers-2.4.0-py2.py3-none-any.whl", hash = "sha256:a163dcaede0f1c021485e957a39245190e74249897e2ae4b2aa38595db237ee0"},
- {file = "sortedcontainers-2.4.0.tar.gz", hash = "sha256:25caa5a06cc30b6b83d11423433f65d1f9d76c4c6a0c90e3379eaa43b9bfdb88"},
-]
-
[[package]]
name = "streamlit"
version = "1.27.0"
{file = "tornado-6.4.1.tar.gz", hash = "sha256:92d3ab53183d8c50f8204a51e6f91d18a15d5ef261e84d452800d4ff6fc504e9"},
]
-[[package]]
-name = "trio"
-version = "0.25.1"
-description = "A friendly Python library for async concurrency and I/O"
-optional = false
-python-versions = ">=3.8"
-files = [
- {file = "trio-0.25.1-py3-none-any.whl", hash = "sha256:e42617ba091e7b2e50c899052e83a3c403101841de925187f61e7b7eaebdf3fb"},
- {file = "trio-0.25.1.tar.gz", hash = "sha256:9f5314f014ea3af489e77b001861c535005c3858d38ec46b6b071ebfa339d7fb"},
-]
-
-[package.dependencies]
-attrs = ">=23.2.0"
-cffi = {version = ">=1.14", markers = "os_name == \"nt\" and implementation_name != \"pypy\""}
-exceptiongroup = {version = "*", markers = "python_version < \"3.11\""}
-idna = "*"
-outcome = "*"
-sniffio = ">=1.3.0"
-sortedcontainers = "*"
-
[[package]]
name = "typing-extensions"
version = "4.12.2"
[metadata]
lock-version = "2.0"
python-versions = ">=3.9,<3.9.7 || >3.9.7,<4.0"
-content-hash = "b0a265c676a14faabce90d2227ef14170b89a647f70e7c7c0f381319b0d9840f"
+content-hash = "42b006d1fdee1ed5cf06d63c01ef2f8b4fa94839a0ebf52a9e16d3e85c4ed202"
[tool.poetry]
name = "waybacktweets"
version = "1.0"
-description = ""
+description = "Retrieves archived tweets' CDX data from the Wayback Machine"
authors = ["Claromes <support@claromes.com>"]
license = "GPL-3.0"
readme = "README.md"
streamlit = "1.27.0"
rich = "^13.6.0"
httpx = "^0.27.0"
-trio = "^0.25.1"
+click = "^8.1.7"
[tool.poetry.group.docs.dependencies]
mkdocs = "^1.6.0"
max-line-length = 88
extend-ignore = ["E203", "E701"]
+[tool.poetry.scripts]
+wbt = 'waybacktweets.cli:cli'
+
[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
--- /dev/null
+"""
+CLI functions for retrieving archived tweets.
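+
+Typical invocation, once the ``wbt`` entry point declared in pyproject.toml
+is installed (a sketch)::
+
+    wbt claromes --unique --from 20230101 --to 20231231 --limit 250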
+"""
+
+from datetime import datetime
+
+import click
+from rich import print as rprint
+
+from waybacktweets.export_tweets import TweetsExporter
+from waybacktweets.parse_tweets import TweetsParser
+from waybacktweets.request_tweets import WaybackTweets
+
+
+def parse_date(ctx, param, value):
+    """Validates and normalizes a date option in YYYYMMDD format."""
+    if value is None:
+        return None
+
+    try:
+        date = datetime.strptime(value, "%Y%m%d")
+        return date.strftime("%Y%m%d")
+    except ValueError:
+        raise click.BadParameter("Dates must be in YYYYMMDD format.")
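+
+# Example: parse_date(None, None, "20230601") returns "20230601", while a
+# malformed value such as "2023-06-01" raises click.BadParameter.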
+
+
+@click.command()
+@click.argument("username", type=str)
+@click.option(
+ "--unique",
+ type=bool,
+ default=False,
+ help="Only show unique URLs. Filtering by the collapse option using the urlkey field.", # noqa: E501
+)
+@click.option(
+ "--from",
+ "timestamp_from",
+ type=click.UNPROCESSED,
+ callback=parse_date,
+ default=None,
+ help="Filtering by date range from this date.",
+)
+@click.option(
+ "--to",
+ "timestamp_to",
+ type=click.UNPROCESSED,
+ callback=parse_date,
+ default=None,
+ help="Filtering by date range up to this date.",
+)
+@click.option("--limit", type=int, default=None, help="Query result limits.")
+def cli(username, unique, timestamp_from, timestamp_to, limit):
+ """
+ Retrieves archived tweets' CDX data from the Wayback Machine,
+ performs necessary parsing, and saves the data.
+
+ USERNAME: The Twitter username without @.
+ """
+ try:
+ api = WaybackTweets(username, unique, timestamp_from, timestamp_to, limit)
+ archived_tweets = api.get()
+
+ if archived_tweets:
+ field_options = [
+ "archived_urlkey",
+ "archived_timestamp",
+ "original_tweet_url",
+ "archived_tweet_url",
+ "parsed_tweet_url",
+ "parsed_archived_tweet_url",
+ "parsed_tweet_text_mimetype_json",
+ "available_tweet_text",
+ "available_tweet_is_RT",
+ "available_tweet_info",
+ "archived_mimetype",
+ "archived_statuscode",
+ "archived_digest",
+ "archived_length",
+ ]
+
+ parser = TweetsParser(archived_tweets, username, field_options)
+ parsed_tweets = parser.parse()
+
+ exporter = TweetsExporter(parsed_tweets, username, field_options)
+
+ exporter.save_to_csv()
+ exporter.save_to_json()
+ exporter.save_to_html()
+
+ except TypeError as e:
+ rprint(f"[red]{e}")
+ finally:
+ rprint(
+ "[yellow]\nNeed help? Open an issue: https://github.com/claromes/waybacktweets/issues" # noqa: E501
+ )
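
A quick way to exercise the new command without installing the console script is Click's built-in test runner; a minimal sketch:

```python
from click.testing import CliRunner

from waybacktweets.cli import cli

runner = CliRunner()
result = runner.invoke(cli, ["claromes", "--limit", "10"])
print(result.exit_code)  # 0 when the command ran without raising
print(result.output)
```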
import re
import pandas as pd
-from rich import print as rprint
-from viz_tweets import HTMLTweetsVisualizer
+
+from waybacktweets.viz_tweets import HTMLTweetsVisualizer
class TweetsExporter:
"""Handles the exporting of parsed archived tweets."""
- def __init__(self, data, username, metadata_options, ascending):
+ def __init__(self, data, username, field_options):
self.data = data
self.username = username
- self.metadata_options = metadata_options
- self.ascending = ascending
+ self.field_options = field_options
self.formatted_datetime = self.datetime_now()
self.filename = f"{self.username}_tweets_{self.formatted_datetime}"
        self.dataframe = self.create_dataframe()
"""Creates a DataFrame from the transposed data."""
data_transposed = self.transpose_matrix(self.data)
- df = pd.DataFrame(data_transposed, columns=self.metadata_options)
- df = df.sort_values(by="archived_timestamp", ascending=self.ascending)
+ df = pd.DataFrame(data_transposed, columns=self.field_options)
return df
csv_file_path = f"{self.filename}.csv"
self.dataframe.to_csv(csv_file_path, index=False)
- rprint(f"[blue]Saved to {csv_file_path}")
+ print(f"Saved to {csv_file_path}")
def save_to_json(self):
"""Saves the DataFrame to a JSON file."""
json_file_path = f"{self.filename}.json"
self.dataframe.to_json(json_file_path, orient="records", lines=False)
- rprint(f"[blue]Saved to {json_file_path}")
+ print(f"Saved to {json_file_path}")
def save_to_html(self):
"""Saves the DataFrame to an HTML file."""
html_content = html.generate()
html.save(html_content)
- rprint(f"[blue]Saved to {html_file_path}")
+ print(f"Saved to {html_file_path}")
+++ /dev/null
-"""
-Main function for retrieving archived tweets.
-"""
-
-import trio
-from export_tweets import TweetsExporter
-from parse_tweets import TweetsParser
-from request_tweets import WaybackTweets
-from rich import print as rprint
-
-username = "claromes"
-unique = True
-datetime_from = None
-datetime_to = None
-ascending = False
-
-
-async def main():
- """
- Invokes the classes to retrieve archived tweets, perform necessary parsing,
- and save the data.
- """
- try:
- api = WaybackTweets(username, unique, datetime_from, datetime_to)
- archived_tweets = await api.get()
-
- if archived_tweets:
- metadata_options = [
- "archived_urlkey",
- "archived_timestamp",
- "original_tweet_url",
- "archived_tweet_url",
- "parsed_tweet_url",
- "parsed_archived_tweet_url",
- "parsed_tweet_text_mimetype_json",
- "available_tweet_text",
- "available_tweet_is_RT",
- "available_tweet_username",
- "archived_mimetype",
- "archived_statuscode",
- "archived_digest",
- "archived_length",
- ]
-
- parser = TweetsParser(archived_tweets, username, metadata_options)
- parsed_tweets = parser.parse()
-
- exporter = TweetsExporter(
- parsed_tweets, username, metadata_options, ascending
- )
- exporter.save_to_csv()
- exporter.save_to_json()
- exporter.save_to_html()
-
- except TypeError as e:
- print(e)
- finally:
- rprint(
- "[yellow]\nNeed help? Open an issue: https://github.com/claromes/waybacktweets/issues" # noqa: E501
- )
-
-
-if __name__ == "__main__":
- trio.run(main)
import httpx
from rich import print as rprint
from rich.progress import Progress
-from utils import (
+
+from waybacktweets.utils import (
check_double_status,
check_pattern_tweet,
clean_tweet_url,
return tweet_content, is_RT, user_info
except Exception:
- rprint("[yellow]Error parsing the tweet, but the metadata was saved.")
+ rprint("[yellow]Error parsing the tweet, but the CDX data was saved.")
return None
return json_data.get("text", json_data)
except Exception:
rprint(
- f"[yellow]Connection error with {self.archived_tweet_url}. Error parsing the JSON, but the metadata was saved." # noqa: E501
+ f"[yellow]Connection error with {self.archived_tweet_url}. Error parsing the JSON, but the CDX data was saved." # noqa: E501
)
return ""
class TweetsParser:
"""Handles the overall parsing of archived tweets."""
- def __init__(self, archived_tweets_response, username, metadata_options):
+ def __init__(self, archived_tweets_response, username, field_options):
self.archived_tweets_response = archived_tweets_response
self.username = username
- self.metadata_options = metadata_options
- self.parsed_tweets = {option: [] for option in self.metadata_options}
+ self.field_options = field_options
+ self.parsed_tweets = {option: [] for option in self.field_options}
- def add_metadata(self, key, value):
+ def add_field(self, key, value):
"""
Appends a value to a list in the parsed data structure.
Defines which data will be structured and saved.
self.parsed_tweets[key].append(value)
def process_response(self, response):
- """Process the archived tweet's response and add the relevant metadata."""
+ """Process the archived tweet's response and add the relevant CDX data."""
tweet_remove_char = unquote(response[2]).replace("’", "")
cleaned_tweet = check_pattern_tweet(tweet_remove_char).strip('"')
content = embed_parser.embed()
if content:
- self.add_metadata("available_tweet_text", semicolon_parser(content[0][0]))
- self.add_metadata("available_tweet_is_RT", content[1][0])
- self.add_metadata(
- "available_tweet_username", semicolon_parser(content[2][0])
- )
+ self.add_field("available_tweet_text", semicolon_parser(content[0][0]))
+ self.add_field("available_tweet_is_RT", content[1][0])
+ self.add_field("available_tweet_info", semicolon_parser(content[2][0]))
parsed_text_json = ""
text_json = json_parser.parse()
parsed_text_json = semicolon_parser(text_json)
- self.add_metadata("parsed_tweet_text_mimetype_json", parsed_text_json)
- self.add_metadata("archived_urlkey", response[0])
- self.add_metadata("archived_timestamp", response[1])
- self.add_metadata("original_tweet_url", encoded_tweet)
- self.add_metadata("archived_tweet_url", encoded_archived_tweet)
- self.add_metadata("parsed_tweet_url", encoded_parsed_tweet)
- self.add_metadata("parsed_archived_tweet_url", encoded_parsed_archived_tweet)
- self.add_metadata("archived_mimetype", response[3])
- self.add_metadata("archived_statuscode", response[4])
- self.add_metadata("archived_digest", response[5])
- self.add_metadata("archived_length", response[6])
+ self.add_field("parsed_tweet_text_mimetype_json", parsed_text_json)
+ self.add_field("archived_urlkey", response[0])
+ self.add_field("archived_timestamp", response[1])
+ self.add_field("original_tweet_url", encoded_tweet)
+ self.add_field("archived_tweet_url", encoded_archived_tweet)
+ self.add_field("parsed_tweet_url", encoded_parsed_tweet)
+ self.add_field("parsed_archived_tweet_url", encoded_parsed_archived_tweet)
+ self.add_field("archived_mimetype", response[3])
+ self.add_field("archived_statuscode", response[4])
+ self.add_field("archived_digest", response[5])
+ self.add_field("archived_length", response[6])
def parse(self):
- """Parses the archived tweets metadata and structures it."""
+ """Parses the archived tweets CDX data and structures it."""
with ThreadPoolExecutor(max_workers=10) as executor:
futures = {
class WaybackTweets:
"""Requests data from the Wayback CDX Server API and returns it in JSON format."""
- def __init__(self, username, unique=False, timestamp_from=None, timestamp_to=None):
+ def __init__(self, username, unique, timestamp_from, timestamp_to, limit):
self.username = username
self.unique = unique
self.timestamp_from = timestamp_from
self.timestamp_to = timestamp_to
+ self.limit = limit
- async def get(self):
+ def get(self):
"""GET request to the Internet Archive's CDX API to retrieve archived tweets."""
url = "https://web.archive.org/cdx/search/cdx"
params = {
"url": f"https://twitter.com/{self.username}/status/*",
"output": "json",
- "limit": 1000,
}
if self.unique:
if self.timestamp_to:
params["to"] = self.timestamp_to
- print("Hi, archivist...")
+ if self.limit:
+ params["limit"] = self.limit
+
+ print("Making a request to the Internet Archive...")
try:
- async with httpx.AsyncClient() as client:
- response = await client.get(url, params=params)
+ response = httpx.get(url, params=params)
if not (400 <= response.status_code <= 511):
return response.json()
"[red]Temporarily Offline: Internet Archive services are temporarily offline. Please check Internet Archive Twitter feed (https://twitter.com/internetarchive) for the latest information." # noqa: E501
)
except UnboundLocalError as e:
- print(e)
+ rprint(f"[red]{e}")
html += "<br>\n"
html += f'<p><strong class="content">Available Tweet Content:</strong> {tweet["available_tweet_text"]}</p>\n'
html += f'<p><strong class="content">Available Tweet Is Retweet:</strong> {tweet["available_tweet_is_RT"]}</p>\n'
- html += f'<p><strong class="content">Available Tweet Username:</strong> {tweet["available_tweet_username"]}</p>\n'
+ html += f'<p><strong class="content">Available Tweet Username:</strong> {tweet["available_tweet_info"]}</p>\n'
if tweet["archived_mimetype"] == "application/json":
html += f'<p><strong class="content">Parsed Tweet Text (application/json):</strong> {tweet["parsed_tweet_text_mimetype_json"]}</p>\n'