use httpx, update prints, and update tweets parser
author     Claromes <claromes@hey.com>
Sat, 8 Jun 2024 02:58:55 +0000 (23:58 -0300)
committer  Claromes <claromes@hey.com>
Sat, 8 Jun 2024 02:58:55 +0000 (23:58 -0300)
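The CDX request path is now async: httpx.AsyncClient replaces requests, and the entry point runs under trio. As a rough standalone sketch of the new flow (fetch_archived_tweets is a hypothetical helper mirroring WaybackTweets.get(); the username and parameters shown are illustrative):

    import httpx
    import trio

    async def fetch_archived_tweets(username: str, limit: int = 1000):
        # Query the Wayback Machine CDX API for archived tweet URLs of a user.
        url = "https://web.archive.org/cdx/search/cdx"
        params = {
            "url": f"https://twitter.com/{username}/status/*",
            "output": "json",
            "limit": limit,
        }
        async with httpx.AsyncClient(timeout=60.0) as client:
            response = await client.get(url, params=params)
            if not (400 <= response.status_code <= 511):
                return response.json()
        return None

    if __name__ == "__main__":
        rows = trio.run(fetch_archived_tweets, "claromes")
        print(rows[:3] if rows else "Nothing here.")

Per-tweet parsing still uses blocking HTTP, so it is dispatched to a ThreadPoolExecutor, with a rich Progress bar replacing rich.progress.track.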
poetry.lock
pyproject.toml
waybacktweets/export_tweets.py
waybacktweets/main.py
waybacktweets/parse_tweets.py
waybacktweets/request_tweets.py

diff --git a/poetry.lock b/poetry.lock
index 57ca31e11890c6cf6bc1578518e67b546055b9a7..e30b1f98491c3de9e8b50cdb7fc8867ce82937c7 100644
@@ -25,6 +25,28 @@ all = ["altair-tiles (>=0.3.0)", "anywidget (>=0.9.0)", "pyarrow (>=11)", "vega-
 dev = ["geopandas", "hatch", "ipython", "m2r", "mypy", "pandas-stubs", "pytest", "pytest-cov", "ruff (>=0.3.0)", "types-jsonschema", "types-setuptools"]
 doc = ["docutils", "jinja2", "myst-parser", "numpydoc", "pillow (>=9,<10)", "pydata-sphinx-theme (>=0.14.1)", "scipy", "sphinx", "sphinx-copybutton", "sphinx-design", "sphinxext-altair"]
 
+[[package]]
+name = "anyio"
+version = "4.4.0"
+description = "High level compatibility layer for multiple asynchronous event loop implementations"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "anyio-4.4.0-py3-none-any.whl", hash = "sha256:c1b2d8f46a8a812513012e1107cb0e68c17159a7a594208005a57dc776e1bdc7"},
+    {file = "anyio-4.4.0.tar.gz", hash = "sha256:5aadc6a1bbb7cdb0bede386cac5e2940f5e2ff3aa20277e991cf028e0585ce94"},
+]
+
+[package.dependencies]
+exceptiongroup = {version = ">=1.0.2", markers = "python_version < \"3.11\""}
+idna = ">=2.8"
+sniffio = ">=1.1"
+typing-extensions = {version = ">=4.1", markers = "python_version < \"3.11\""}
+
+[package.extras]
+doc = ["Sphinx (>=7)", "packaging", "sphinx-autodoc-typehints (>=1.2.0)", "sphinx-rtd-theme"]
+test = ["anyio[trio]", "coverage[toml] (>=7)", "exceptiongroup (>=1.2.0)", "hypothesis (>=4.0)", "psutil (>=5.9)", "pytest (>=7.0)", "pytest-mock (>=3.6.1)", "trustme", "uvloop (>=0.17)"]
+trio = ["trio (>=0.23)"]
+
 [[package]]
 name = "attrs"
 version = "23.2.0"
@@ -137,6 +159,70 @@ files = [
     {file = "certifi-2024.6.2.tar.gz", hash = "sha256:3cd43f1c6fa7dedc5899d69d3ad0398fd018ad1a17fba83ddaf78aa46c747516"},
 ]
 
+[[package]]
+name = "cffi"
+version = "1.16.0"
+description = "Foreign Function Interface for Python calling C code."
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "cffi-1.16.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:6b3d6606d369fc1da4fd8c357d026317fbb9c9b75d36dc16e90e84c26854b088"},
+    {file = "cffi-1.16.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ac0f5edd2360eea2f1daa9e26a41db02dd4b0451b48f7c318e217ee092a213e9"},
+    {file = "cffi-1.16.0-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7e61e3e4fa664a8588aa25c883eab612a188c725755afff6289454d6362b9673"},
+    {file = "cffi-1.16.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a72e8961a86d19bdb45851d8f1f08b041ea37d2bd8d4fd19903bc3083d80c896"},
+    {file = "cffi-1.16.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5b50bf3f55561dac5438f8e70bfcdfd74543fd60df5fa5f62d94e5867deca684"},
+    {file = "cffi-1.16.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7651c50c8c5ef7bdb41108b7b8c5a83013bfaa8a935590c5d74627c047a583c7"},
+    {file = "cffi-1.16.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e4108df7fe9b707191e55f33efbcb2d81928e10cea45527879a4749cbe472614"},
+    {file = "cffi-1.16.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:32c68ef735dbe5857c810328cb2481e24722a59a2003018885514d4c09af9743"},
+    {file = "cffi-1.16.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:673739cb539f8cdaa07d92d02efa93c9ccf87e345b9a0b556e3ecc666718468d"},
+    {file = "cffi-1.16.0-cp310-cp310-win32.whl", hash = "sha256:9f90389693731ff1f659e55c7d1640e2ec43ff725cc61b04b2f9c6d8d017df6a"},
+    {file = "cffi-1.16.0-cp310-cp310-win_amd64.whl", hash = "sha256:e6024675e67af929088fda399b2094574609396b1decb609c55fa58b028a32a1"},
+    {file = "cffi-1.16.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b84834d0cf97e7d27dd5b7f3aca7b6e9263c56308ab9dc8aae9784abb774d404"},
+    {file = "cffi-1.16.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1b8ebc27c014c59692bb2664c7d13ce7a6e9a629be20e54e7271fa696ff2b417"},
+    {file = "cffi-1.16.0-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ee07e47c12890ef248766a6e55bd38ebfb2bb8edd4142d56db91b21ea68b7627"},
+    {file = "cffi-1.16.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d8a9d3ebe49f084ad71f9269834ceccbf398253c9fac910c4fd7053ff1386936"},
+    {file = "cffi-1.16.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e70f54f1796669ef691ca07d046cd81a29cb4deb1e5f942003f401c0c4a2695d"},
+    {file = "cffi-1.16.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5bf44d66cdf9e893637896c7faa22298baebcd18d1ddb6d2626a6e39793a1d56"},
+    {file = "cffi-1.16.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7b78010e7b97fef4bee1e896df8a4bbb6712b7f05b7ef630f9d1da00f6444d2e"},
+    {file = "cffi-1.16.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:c6a164aa47843fb1b01e941d385aab7215563bb8816d80ff3a363a9f8448a8dc"},
+    {file = "cffi-1.16.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e09f3ff613345df5e8c3667da1d918f9149bd623cd9070c983c013792a9a62eb"},
+    {file = "cffi-1.16.0-cp311-cp311-win32.whl", hash = "sha256:2c56b361916f390cd758a57f2e16233eb4f64bcbeee88a4881ea90fca14dc6ab"},
+    {file = "cffi-1.16.0-cp311-cp311-win_amd64.whl", hash = "sha256:db8e577c19c0fda0beb7e0d4e09e0ba74b1e4c092e0e40bfa12fe05b6f6d75ba"},
+    {file = "cffi-1.16.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:fa3a0128b152627161ce47201262d3140edb5a5c3da88d73a1b790a959126956"},
+    {file = "cffi-1.16.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:68e7c44931cc171c54ccb702482e9fc723192e88d25a0e133edd7aff8fcd1f6e"},
+    {file = "cffi-1.16.0-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:abd808f9c129ba2beda4cfc53bde801e5bcf9d6e0f22f095e45327c038bfe68e"},
+    {file = "cffi-1.16.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:88e2b3c14bdb32e440be531ade29d3c50a1a59cd4e51b1dd8b0865c54ea5d2e2"},
+    {file = "cffi-1.16.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:fcc8eb6d5902bb1cf6dc4f187ee3ea80a1eba0a89aba40a5cb20a5087d961357"},
+    {file = "cffi-1.16.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b7be2d771cdba2942e13215c4e340bfd76398e9227ad10402a8767ab1865d2e6"},
+    {file = "cffi-1.16.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e715596e683d2ce000574bae5d07bd522c781a822866c20495e52520564f0969"},
+    {file = "cffi-1.16.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:2d92b25dbf6cae33f65005baf472d2c245c050b1ce709cc4588cdcdd5495b520"},
+    {file = "cffi-1.16.0-cp312-cp312-win32.whl", hash = "sha256:b2ca4e77f9f47c55c194982e10f058db063937845bb2b7a86c84a6cfe0aefa8b"},
+    {file = "cffi-1.16.0-cp312-cp312-win_amd64.whl", hash = "sha256:68678abf380b42ce21a5f2abde8efee05c114c2fdb2e9eef2efdb0257fba1235"},
+    {file = "cffi-1.16.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:0c9ef6ff37e974b73c25eecc13952c55bceed9112be2d9d938ded8e856138bcc"},
+    {file = "cffi-1.16.0-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a09582f178759ee8128d9270cd1344154fd473bb77d94ce0aeb2a93ebf0feaf0"},
+    {file = "cffi-1.16.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e760191dd42581e023a68b758769e2da259b5d52e3103c6060ddc02c9edb8d7b"},
+    {file = "cffi-1.16.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:80876338e19c951fdfed6198e70bc88f1c9758b94578d5a7c4c91a87af3cf31c"},
+    {file = "cffi-1.16.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a6a14b17d7e17fa0d207ac08642c8820f84f25ce17a442fd15e27ea18d67c59b"},
+    {file = "cffi-1.16.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6602bc8dc6f3a9e02b6c22c4fc1e47aa50f8f8e6d3f78a5e16ac33ef5fefa324"},
+    {file = "cffi-1.16.0-cp38-cp38-win32.whl", hash = "sha256:131fd094d1065b19540c3d72594260f118b231090295d8c34e19a7bbcf2e860a"},
+    {file = "cffi-1.16.0-cp38-cp38-win_amd64.whl", hash = "sha256:31d13b0f99e0836b7ff893d37af07366ebc90b678b6664c955b54561fc36ef36"},
+    {file = "cffi-1.16.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:582215a0e9adbe0e379761260553ba11c58943e4bbe9c36430c4ca6ac74b15ed"},
+    {file = "cffi-1.16.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b29ebffcf550f9da55bec9e02ad430c992a87e5f512cd63388abb76f1036d8d2"},
+    {file = "cffi-1.16.0-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dc9b18bf40cc75f66f40a7379f6a9513244fe33c0e8aa72e2d56b0196a7ef872"},
+    {file = "cffi-1.16.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9cb4a35b3642fc5c005a6755a5d17c6c8b6bcb6981baf81cea8bfbc8903e8ba8"},
+    {file = "cffi-1.16.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b86851a328eedc692acf81fb05444bdf1891747c25af7529e39ddafaf68a4f3f"},
+    {file = "cffi-1.16.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c0f31130ebc2d37cdd8e44605fb5fa7ad59049298b3f745c74fa74c62fbfcfc4"},
+    {file = "cffi-1.16.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f8e709127c6c77446a8c0a8c8bf3c8ee706a06cd44b1e827c3e6a2ee6b8c098"},
+    {file = "cffi-1.16.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:748dcd1e3d3d7cd5443ef03ce8685043294ad6bd7c02a38d1bd367cfd968e000"},
+    {file = "cffi-1.16.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:8895613bcc094d4a1b2dbe179d88d7fb4a15cee43c052e8885783fac397d91fe"},
+    {file = "cffi-1.16.0-cp39-cp39-win32.whl", hash = "sha256:ed86a35631f7bfbb28e108dd96773b9d5a6ce4811cf6ea468bb6a359b256b1e4"},
+    {file = "cffi-1.16.0-cp39-cp39-win_amd64.whl", hash = "sha256:3686dffb02459559c74dd3d81748269ffb0eb027c39a6fc99502de37d501faa8"},
+    {file = "cffi-1.16.0.tar.gz", hash = "sha256:bcb3ef43e58665bbda2fb198698fcae6776483e0c4a631aa5647806c25e02cc0"},
+]
+
+[package.dependencies]
+pycparser = "*"
+
 [[package]]
 name = "cfgv"
 version = "3.4.0"
@@ -283,6 +369,20 @@ files = [
     {file = "distlib-0.3.8.tar.gz", hash = "sha256:1530ea13e350031b6312d8580ddb6b27a104275a31106523b8f123787f494f64"},
 ]
 
+[[package]]
+name = "exceptiongroup"
+version = "1.2.1"
+description = "Backport of PEP 654 (exception groups)"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "exceptiongroup-1.2.1-py3-none-any.whl", hash = "sha256:5258b9ed329c5bbdd31a309f53cbfb0b155341807f6ff7606a1e801a891b29ad"},
+    {file = "exceptiongroup-1.2.1.tar.gz", hash = "sha256:a4785e48b045528f5bfe627b6ad554ff32def154f42372786903b7abcfe1aa16"},
+]
+
+[package.extras]
+test = ["pytest (>=6)"]
+
 [[package]]
 name = "filelock"
 version = "3.14.0"
@@ -381,6 +481,62 @@ gitdb = ">=4.0.1,<5"
 doc = ["sphinx (==4.3.2)", "sphinx-autodoc-typehints", "sphinx-rtd-theme", "sphinxcontrib-applehelp (>=1.0.2,<=1.0.4)", "sphinxcontrib-devhelp (==1.0.2)", "sphinxcontrib-htmlhelp (>=2.0.0,<=2.0.1)", "sphinxcontrib-qthelp (==1.0.3)", "sphinxcontrib-serializinghtml (==1.1.5)"]
 test = ["coverage[toml]", "ddt (>=1.1.1,!=1.4.3)", "mock", "mypy", "pre-commit", "pytest (>=7.3.1)", "pytest-cov", "pytest-instafail", "pytest-mock", "pytest-sugar", "typing-extensions"]
 
+[[package]]
+name = "h11"
+version = "0.14.0"
+description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "h11-0.14.0-py3-none-any.whl", hash = "sha256:e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761"},
+    {file = "h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d"},
+]
+
+[[package]]
+name = "httpcore"
+version = "1.0.5"
+description = "A minimal low-level HTTP client."
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "httpcore-1.0.5-py3-none-any.whl", hash = "sha256:421f18bac248b25d310f3cacd198d55b8e6125c107797b609ff9b7a6ba7991b5"},
+    {file = "httpcore-1.0.5.tar.gz", hash = "sha256:34a38e2f9291467ee3b44e89dd52615370e152954ba21721378a87b2960f7a61"},
+]
+
+[package.dependencies]
+certifi = "*"
+h11 = ">=0.13,<0.15"
+
+[package.extras]
+asyncio = ["anyio (>=4.0,<5.0)"]
+http2 = ["h2 (>=3,<5)"]
+socks = ["socksio (==1.*)"]
+trio = ["trio (>=0.22.0,<0.26.0)"]
+
+[[package]]
+name = "httpx"
+version = "0.27.0"
+description = "The next generation HTTP client."
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "httpx-0.27.0-py3-none-any.whl", hash = "sha256:71d5465162c13681bff01ad59b2cc68dd838ea1f10e51574bac27103f00c91a5"},
+    {file = "httpx-0.27.0.tar.gz", hash = "sha256:a0cb88a46f32dc874e04ee956e4c2764aba2aa228f650b06788ba6bda2962ab5"},
+]
+
+[package.dependencies]
+anyio = "*"
+certifi = "*"
+httpcore = "==1.*"
+idna = "*"
+sniffio = "*"
+
+[package.extras]
+brotli = ["brotli", "brotlicffi"]
+cli = ["click (==8.*)", "pygments (==2.*)", "rich (>=10,<14)"]
+http2 = ["h2 (>=3,<5)"]
+socks = ["socksio (==1.*)"]
+
 [[package]]
 name = "identify"
 version = "2.5.36"
@@ -790,6 +946,20 @@ files = [
     {file = "numpy-1.26.4.tar.gz", hash = "sha256:2a02aba9ed12e4ac4eb3ea9421c420301a0c6460d9830d74a9df87efa4912010"},
 ]
 
+[[package]]
+name = "outcome"
+version = "1.3.0.post0"
+description = "Capture the outcome of Python function calls."
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "outcome-1.3.0.post0-py2.py3-none-any.whl", hash = "sha256:e771c5ce06d1415e356078d3bdd68523f284b4ce5419828922b6871e65eda82b"},
+    {file = "outcome-1.3.0.post0.tar.gz", hash = "sha256:9dcf02e65f2971b80047b377468e72a268e15c0af3cf1238e6ff14f7f91143b8"},
+]
+
+[package.dependencies]
+attrs = ">=19.2.0"
+
 [[package]]
 name = "packaging"
 version = "23.2"
@@ -1087,6 +1257,17 @@ files = [
     {file = "pycodestyle-2.11.1.tar.gz", hash = "sha256:41ba0e7afc9752dfb53ced5489e89f8186be00e599e712660695b7a75ff2663f"},
 ]
 
+[[package]]
+name = "pycparser"
+version = "2.22"
+description = "C parser in Python"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "pycparser-2.22-py3-none-any.whl", hash = "sha256:c3702b6d3dd8c7abc1afa565d7e63d53a1d0bd86cdc24edd75470f4de499cfcc"},
+    {file = "pycparser-2.22.tar.gz", hash = "sha256:491c8be9c040f5390f5bf44a5b07752bd07f56edf992381b05c701439eec10f6"},
+]
+
 [[package]]
 name = "pydeck"
 version = "0.9.1"
@@ -1520,6 +1701,28 @@ files = [
     {file = "smmap-5.0.1.tar.gz", hash = "sha256:dceeb6c0028fdb6734471eb07c0cd2aae706ccaecab45965ee83f11c8d3b1f62"},
 ]
 
+[[package]]
+name = "sniffio"
+version = "1.3.1"
+description = "Sniff out which async library your code is running under"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2"},
+    {file = "sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc"},
+]
+
+[[package]]
+name = "sortedcontainers"
+version = "2.4.0"
+description = "Sorted Containers -- Sorted List, Sorted Dict, Sorted Set"
+optional = false
+python-versions = "*"
+files = [
+    {file = "sortedcontainers-2.4.0-py2.py3-none-any.whl", hash = "sha256:a163dcaede0f1c021485e957a39245190e74249897e2ae4b2aa38595db237ee0"},
+    {file = "sortedcontainers-2.4.0.tar.gz", hash = "sha256:25caa5a06cc30b6b83d11423433f65d1f9d76c4c6a0c90e3379eaa43b9bfdb88"},
+]
+
 [[package]]
 name = "streamlit"
 version = "1.27.0"
@@ -1627,6 +1830,26 @@ files = [
     {file = "tornado-6.4.tar.gz", hash = "sha256:72291fa6e6bc84e626589f1c29d90a5a6d593ef5ae68052ee2ef000dfd273dee"},
 ]
 
+[[package]]
+name = "trio"
+version = "0.25.1"
+description = "A friendly Python library for async concurrency and I/O"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "trio-0.25.1-py3-none-any.whl", hash = "sha256:e42617ba091e7b2e50c899052e83a3c403101841de925187f61e7b7eaebdf3fb"},
+    {file = "trio-0.25.1.tar.gz", hash = "sha256:9f5314f014ea3af489e77b001861c535005c3858d38ec46b6b071ebfa339d7fb"},
+]
+
+[package.dependencies]
+attrs = ">=23.2.0"
+cffi = {version = ">=1.14", markers = "os_name == \"nt\" and implementation_name != \"pypy\""}
+exceptiongroup = {version = "*", markers = "python_version < \"3.11\""}
+idna = "*"
+outcome = "*"
+sniffio = ">=1.3.0"
+sortedcontainers = "*"
+
 [[package]]
 name = "typing-extensions"
 version = "4.12.1"
@@ -1776,4 +1999,4 @@ test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools",
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.9,<3.9.7 || >3.9.7,<4.0"
-content-hash = "6dbea10017dac018837628b065985efe6b4ee98c841beb7076dacaeabec2646e"
+content-hash = "b0a265c676a14faabce90d2227ef14170b89a647f70e7c7c0f381319b0d9840f"
diff --git a/pyproject.toml b/pyproject.toml
index 73d60484211fca7e02702aa702f5a5038213385c..607125eddb42d59873f0c4682dcaf7127595c24f 100644
@@ -11,6 +11,8 @@ python = ">=3.9,<3.9.7 || >3.9.7,<4.0"
 requests = "^2.30.0"
 streamlit = "1.27.0"
 rich = "^13.6.0"
+httpx = "^0.27.0"
+trio = "^0.25.1"
 
 [tool.poetry.group.docs.dependencies]
 mkdocs = "^1.6.0"
diff --git a/waybacktweets/export_tweets.py b/waybacktweets/export_tweets.py
index 71d2606a4399a2e7f757900fff9b29912e869a01..b43410dd410a5af5fa4444eb87a417d6c1733f7b 100644
@@ -3,6 +3,7 @@ import os
 import re
 
 import pandas as pd
+from rich import print as rprint
 from viz_tweets import HTMLTweetsVisualizer
 
 
@@ -59,14 +60,14 @@ class TweetsExporter:
         csv_file_path = f"{self.filename}.csv"
         self.dataframe.to_csv(csv_file_path, index=False)
 
-        print(f"Saved to {csv_file_path}")
+        rprint(f"[blue]Saved to {csv_file_path}")
 
     def save_to_json(self):
         """Saves the DataFrame to a JSON file."""
         json_file_path = f"{self.filename}.json"
         self.dataframe.to_json(json_file_path, orient="records", lines=False)
 
-        print(f"Saved to {json_file_path}")
+        rprint(f"[blue]Saved to {json_file_path}")
 
     def save_to_html(self):
         """Saves the DataFrame to an HTML file."""
@@ -82,4 +83,4 @@ class TweetsExporter:
         html_content = html.generate()
         html.save(html_content)
 
-        print(f"Saved to {html_file_path}")
+        rprint(f"[blue]Saved to {html_file_path}")
diff --git a/waybacktweets/main.py b/waybacktweets/main.py
index f97586688e01ca269ea4bf407035c5d8260ec3ac..d7d5b8a7aa83adac277e3189b4af7b1bb5726f5b 100644
@@ -2,26 +2,27 @@
 Main function for retrieving archived tweets.
 """
 
+import trio
 from export_tweets import TweetsExporter
 from parse_tweets import TweetsParser
 from request_tweets import WaybackTweets
 from rich import print as rprint
 
 username = "claromes"
-unique = False
-datetime_from = "2020-01-01"
-datetime_to = "2024-05-31"
+unique = True
+datetime_from = None
+datetime_to = None
 ascending = False
 
 
-def main():
+async def main():
     """
     Invokes the classes to retrieve archived tweets, perform necessary parsing,
     and save the data.
     """
     try:
         api = WaybackTweets(username, unique, datetime_from, datetime_to)
-        archived_tweets = api.get()
+        archived_tweets = await api.get()
 
         if archived_tweets:
             metadata_options = [
@@ -50,8 +51,6 @@ def main():
             exporter.save_to_csv()
             exporter.save_to_json()
             exporter.save_to_html()
-        else:
-            print("Nothing here.")
 
     except TypeError as e:
         print(e)
@@ -62,4 +61,4 @@ def main():
 
 
 if __name__ == "__main__":
-    main()
+    trio.run(main)
diff --git a/waybacktweets/parse_tweets.py b/waybacktweets/parse_tweets.py
index 8e70a46aed98f83ea4948bf72b049a9216d30ed1..57985f1efb03323574819d0c1306626ff85492a6 100644
@@ -1,8 +1,10 @@
 import re
+from concurrent.futures import ThreadPoolExecutor, as_completed
 from urllib.parse import unquote
 
-import requests
-from rich.progress import track
+import httpx
+from rich import print as rprint
+from rich.progress import Progress
 from utils import (
     check_double_status,
     check_pattern_tweet,
@@ -22,15 +24,19 @@ class TwitterEmbed:
         """Parses the archived tweets when they are still available."""
         try:
             url = f"https://publish.twitter.com/oembed?url={self.tweet_url}"
-            response = requests.get(url)
+            response = httpx.get(url)
             if not (400 <= response.status_code <= 511):
-                html = response.json()["html"]
-                author_name = response.json()["author_name"]
+                json_response = response.json()
+                html = json_response["html"]
+                author_name = json_response["author_name"]
 
-                regex = r'<blockquote class="twitter-tweet"(?: [^>]+)?><p[^>]*>(.*?)<\/p>.*?&mdash; (.*?)<\/a>'  # noqa: E501
-                regex_author = r"^(.*?)\s*\("
+                regex = re.compile(
+                    r'<blockquote class="twitter-tweet"(?: [^>]+)?><p[^>]*>(.*?)<\/p>.*?&mdash; (.*?)<\/a>',  # noqa
+                    re.DOTALL,
+                )
+                regex_author = re.compile(r"^(.*?)\s*\(")
 
-                matches_html = re.findall(regex, html, re.DOTALL)
+                matches_html = regex.findall(html)
 
                 tweet_content = []
                 user_info = []
@@ -39,29 +45,22 @@ class TwitterEmbed:
                 for match in matches_html:
                     tweet_content_match = re.sub(
                         r"<a[^>]*>|<\/a>", "", match[0].strip()
-                    )
-                    tweet_content_match = tweet_content_match.replace("<br>", "\n")
-
-                    user_info_match = re.sub(r"<a[^>]*>|<\/a>", "", match[1].strip())
-                    user_info_match = user_info_match.replace(")", "), ")
-
-                    match_author = re.search(regex_author, user_info_match)
+                    ).replace("<br>", "\n")
+                    user_info_match = re.sub(
+                        r"<a[^>]*>|<\/a>", "", match[1].strip()
+                    ).replace(")", "), ")
+                    match_author = regex_author.search(user_info_match)
                     author_tweet = match_author.group(1) if match_author else ""
 
                     if tweet_content_match:
                         tweet_content.append(tweet_content_match)
                     if user_info_match:
                         user_info.append(user_info_match)
-
-                        is_RT_match = False
-                        if author_name != author_tweet:
-                            is_RT_match = True
-
-                        is_RT.append(is_RT_match)
+                        is_RT.append(author_name != author_tweet)
 
                 return tweet_content, is_RT, user_info
-        except Exception as e:
-            print(f"Error parsing tweet: {e}")
+        except Exception:
+            rprint("[yellow]Error parsing the tweet, but the metadata was saved.")
             return None
 
 
@@ -74,21 +73,26 @@ class JsonParser:
     def parse(self):
         """Parses the archived tweets in JSON format."""
         try:
-            response = requests.get(self.archived_tweet_url)
-            if not (400 <= response.status_code <= 511):
+            response = httpx.get(self.archived_tweet_url)
+
+            if response and not (400 <= response.status_code <= 511):
                 json_data = response.json()
 
                 if "data" in json_data:
                     return json_data["data"].get("text", json_data["data"])
-                elif "retweeted_status" in json_data:
+
+                if "retweeted_status" in json_data:
                     return json_data["retweeted_status"].get(
                         "text", json_data["retweeted_status"]
                     )
-                else:
-                    return json_data.get("text", json_data)
-        except Exception as e:
-            print(f"Error parsing JSON mimetype tweet: {e}")
-            return None
+
+                return json_data.get("text", json_data)
+        except Exception:
+            rprint(
+                f"[yellow]Connection error with {self.archived_tweet_url}. Error parsing the JSON, but the metadata was saved."  # noqa: E501
+            )
+
+            return ""
 
 
 class TweetsParser:
@@ -108,71 +112,87 @@ class TweetsParser:
         if key in self.parsed_tweets:
             self.parsed_tweets[key].append(value)
 
-    def parse(self):
-        """Parses the archived tweets metadata and structures it."""
-        for response in track(
-            self.archived_tweets_response[1:],
-            description=f"Wayback @{self.username} tweets\n",
-        ):
-            tweet_remove_char = unquote(response[2]).replace("’", "")
-            cleaned_tweet = check_pattern_tweet(tweet_remove_char).strip('"')
-
-            wayback_machine_url = (
-                f"https://web.archive.org/web/{response[1]}/{tweet_remove_char}"
-            )
+    def process_response(self, response):
+        """Process the archived tweet's response and add the relevant metadata."""
+        tweet_remove_char = unquote(response[2]).replace("’", "")
+        cleaned_tweet = check_pattern_tweet(tweet_remove_char).strip('"')
+
+        wayback_machine_url = (
+            f"https://web.archive.org/web/{response[1]}/{tweet_remove_char}"
+        )
+        original_tweet = delete_tweet_pathnames(
+            clean_tweet_url(cleaned_tweet, self.username)
+        )
+        parsed_wayback_machine_url = (
+            f"https://web.archive.org/web/{response[1]}/{original_tweet}"
+        )
+
+        double_status = check_double_status(wayback_machine_url, original_tweet)
+
+        if double_status:
             original_tweet = delete_tweet_pathnames(
-                clean_tweet_url(cleaned_tweet, self.username)
+                f"https://twitter.com/{original_tweet}"
             )
-            parsed_wayback_machine_url = (
-                f"https://web.archive.org/web/{response[1]}/{original_tweet}"
-            )
-
-            double_status = check_double_status(wayback_machine_url, original_tweet)
+        elif "://" not in original_tweet:
+            original_tweet = delete_tweet_pathnames(f"https://{original_tweet}")
 
-            if double_status:
-                original_tweet = delete_tweet_pathnames(
-                    f"https://twitter.com/{original_tweet}"
-                )
-            elif "://" not in original_tweet:
-                original_tweet = delete_tweet_pathnames(f"https://{original_tweet}")
+        encoded_tweet = semicolon_parser(response[2])
+        encoded_archived_tweet = semicolon_parser(wayback_machine_url)
+        encoded_parsed_tweet = semicolon_parser(original_tweet)
+        encoded_parsed_archived_tweet = semicolon_parser(parsed_wayback_machine_url)
 
-            encoded_tweet = semicolon_parser(response[2])
-            encoded_archived_tweet = semicolon_parser(wayback_machine_url)
-            encoded_parsed_tweet = semicolon_parser(original_tweet)
-            encoded_parsed_archived_tweet = semicolon_parser(parsed_wayback_machine_url)
+        embed_parser = TwitterEmbed(encoded_tweet)
+        content = embed_parser.embed()
 
-            embed_parser = TwitterEmbed(encoded_tweet)
-            content = embed_parser.embed()
+        if content:
+            self.add_metadata("available_tweet_text", semicolon_parser(content[0][0]))
+            self.add_metadata("available_tweet_is_RT", content[1][0])
+            self.add_metadata(
+                "available_tweet_username", semicolon_parser(content[2][0])
+            )
 
-            if content:
-                self.add_metadata(
-                    "available_tweet_text", semicolon_parser(content[0][0])
-                )
-                self.add_metadata("available_tweet_is_RT", content[1][0])
-                self.add_metadata(
-                    "available_tweet_username", semicolon_parser(content[2][0])
-                )
+        parsed_text_json = ""
 
-            if response[3] == "application/json":
-                json_parser = JsonParser(encoded_archived_tweet)
+        if response[3] == "application/json":
+            json_parser = JsonParser(encoded_archived_tweet)
+            if json_parser:
                 text_json = json_parser.parse()
                 parsed_text_json = semicolon_parser(text_json)
-            else:
-                parsed_text_json = None
 
-            self.add_metadata("parsed_tweet_text_mimetype_json", parsed_text_json)
+        self.add_metadata("parsed_tweet_text_mimetype_json", parsed_text_json)
+        self.add_metadata("archived_urlkey", response[0])
+        self.add_metadata("archived_timestamp", response[1])
+        self.add_metadata("original_tweet_url", encoded_tweet)
+        self.add_metadata("archived_tweet_url", encoded_archived_tweet)
+        self.add_metadata("parsed_tweet_url", encoded_parsed_tweet)
+        self.add_metadata("parsed_archived_tweet_url", encoded_parsed_archived_tweet)
+        self.add_metadata("archived_mimetype", response[3])
+        self.add_metadata("archived_statuscode", response[4])
+        self.add_metadata("archived_digest", response[5])
+        self.add_metadata("archived_length", response[6])
 
-            self.add_metadata("archived_urlkey", response[0])
-            self.add_metadata("archived_timestamp", response[1])
-            self.add_metadata("original_tweet_url", encoded_tweet)
-            self.add_metadata("archived_tweet_url", encoded_archived_tweet)
-            self.add_metadata("parsed_tweet_url", encoded_parsed_tweet)
-            self.add_metadata(
-                "parsed_archived_tweet_url", encoded_parsed_archived_tweet
-            )
-            self.add_metadata("archived_mimetype", response[3])
-            self.add_metadata("archived_statuscode", response[4])
-            self.add_metadata("archived_digest", response[5])
-            self.add_metadata("archived_length", response[6])
+    def parse(self):
+        """Parses the archived tweets metadata and structures it."""
+        with ThreadPoolExecutor(max_workers=10) as executor:
+
+            futures = {
+                executor.submit(self.process_response, response): response
+                for response in self.archived_tweets_response[1:]
+            }
+            with Progress() as progress:
+                task = progress.add_task(
+                    f"Waybacking @{self.username} tweets\n", total=len(futures)
+                )
+
+                for future in as_completed(futures):
+                    try:
+                        with httpx.Client(timeout=60.0):
+                            future.result()
+                    except httpx.RequestError as e:
+                        rprint(f"[red]{e}")
+                    except Exception as e:
+                        rprint(f"[red]{e}")
+
+                    progress.update(task, advance=1)
 
-        return self.parsed_tweets
+            return self.parsed_tweets
diff --git a/waybacktweets/request_tweets.py b/waybacktweets/request_tweets.py
index c419191708753be21191678c25834fbc2974e06d..97c0c13ba4ddcfd1e72dd16d7eaa506c2034f463 100644
@@ -1,42 +1,49 @@
-import requests
+import httpx
+from rich import print as rprint
 
 
 class WaybackTweets:
     """Requests data from the Wayback CDX Server API and returns it in JSON format."""
 
-    def __init__(self, username, unique=False, timestamp_from="", timestamp_to=""):
+    def __init__(self, username, unique=False, timestamp_from=None, timestamp_to=None):
         self.username = username
         self.unique = unique
         self.timestamp_from = timestamp_from
         self.timestamp_to = timestamp_to
 
-    def get(self):
-        unique_param = "&collapse=urlkey" if self.unique else ""
-        timestamp_from_param = (
-            f"&from={self.timestamp_from}" if self.timestamp_from else ""
-        )
-        timestamp_to_param = f"&to={self.timestamp_to}" if self.timestamp_to else ""
-
-        url = (
-            f"https://web.archive.org/cdx/search/cdx?url=https://twitter.com/{self.username}/status/*"  # noqa: E501
-            f"&output=json{unique_param}{timestamp_from_param}{timestamp_to_param}&limit=20"  # noqa: E501
-        )
+    async def get(self):
+        """GET request to the Internet Archive's CDX API to retrieve archived tweets."""
+        url = "https://web.archive.org/cdx/search/cdx"
+        params = {
+            "url": f"https://twitter.com/{self.username}/status/*",
+            "output": "json",
+            "limit": 1000,
+        }
+
+        if self.unique:
+            params["collapse"] = "urlkey"
+
+        if self.timestamp_from:
+            params["from"] = self.timestamp_from
+
+        if self.timestamp_to:
+            params["to"] = self.timestamp_to
+
         print("Hi, archivist...")
 
         try:
-            response = requests.get(url)
-            response.raise_for_status()
+            async with httpx.AsyncClient() as client:
+                response = await client.get(url, params=params)
 
             if not (400 <= response.status_code <= 511):
                 return response.json()
-        except requests.exceptions.Timeout as e:
-            print(f"{e}.\nConnection to web.archive.org timed out.")
-        except requests.exceptions.ConnectionError as e:
-            print(f"{e}.\nFailed to establish a new connection with web.archive.org.")
-        except requests.exceptions.HTTPError as e:
-            print(
-                f"{e}.\nTemporarily Offline: Internet Archive services are temporarily offline. Please check Internet Archive [Twitter feed](https://twitter.com/internetarchive/) for the latest information."  # noqa: E501
+        except httpx._exceptions.ReadTimeout:
+            rprint("[red]Connection to web.archive.org timed out.")
+        except httpx._exceptions.ConnectError:
+            rprint("[red]Failed to establish a new connection with web.archive.org.")
+        except httpx._exceptions.HTTPError:
+            rprint(
+                "[red]Temporarily Offline: Internet Archive services are temporarily offline. Please check Internet Archive Twitter feed (https://twitter.com/internetarchive) for the latest information."  # noqa: E501
             )
         except UnboundLocalError as e:
             print(e)
-        return None