dev = ["geopandas", "hatch", "ipython", "m2r", "mypy", "pandas-stubs", "pytest", "pytest-cov", "ruff (>=0.3.0)", "types-jsonschema", "types-setuptools"]
doc = ["docutils", "jinja2", "myst-parser", "numpydoc", "pillow (>=9,<10)", "pydata-sphinx-theme (>=0.14.1)", "scipy", "sphinx", "sphinx-copybutton", "sphinx-design", "sphinxext-altair"]
+[[package]]
+name = "anyio"
+version = "4.4.0"
+description = "High level compatibility layer for multiple asynchronous event loop implementations"
+optional = false
+python-versions = ">=3.8"
+files = [
+ {file = "anyio-4.4.0-py3-none-any.whl", hash = "sha256:c1b2d8f46a8a812513012e1107cb0e68c17159a7a594208005a57dc776e1bdc7"},
+ {file = "anyio-4.4.0.tar.gz", hash = "sha256:5aadc6a1bbb7cdb0bede386cac5e2940f5e2ff3aa20277e991cf028e0585ce94"},
+]
+
+[package.dependencies]
+exceptiongroup = {version = ">=1.0.2", markers = "python_version < \"3.11\""}
+idna = ">=2.8"
+sniffio = ">=1.1"
+typing-extensions = {version = ">=4.1", markers = "python_version < \"3.11\""}
+
+[package.extras]
+doc = ["Sphinx (>=7)", "packaging", "sphinx-autodoc-typehints (>=1.2.0)", "sphinx-rtd-theme"]
+test = ["anyio[trio]", "coverage[toml] (>=7)", "exceptiongroup (>=1.2.0)", "hypothesis (>=4.0)", "psutil (>=5.9)", "pytest (>=7.0)", "pytest-mock (>=3.6.1)", "trustme", "uvloop (>=0.17)"]
+trio = ["trio (>=0.23)"]
+
[[package]]
name = "attrs"
version = "23.2.0"
{file = "certifi-2024.6.2.tar.gz", hash = "sha256:3cd43f1c6fa7dedc5899d69d3ad0398fd018ad1a17fba83ddaf78aa46c747516"},
]
+[[package]]
+name = "cffi"
+version = "1.16.0"
+description = "Foreign Function Interface for Python calling C code."
+optional = false
+python-versions = ">=3.8"
+files = [
+ {file = "cffi-1.16.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:6b3d6606d369fc1da4fd8c357d026317fbb9c9b75d36dc16e90e84c26854b088"},
+ {file = "cffi-1.16.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ac0f5edd2360eea2f1daa9e26a41db02dd4b0451b48f7c318e217ee092a213e9"},
+ {file = "cffi-1.16.0-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7e61e3e4fa664a8588aa25c883eab612a188c725755afff6289454d6362b9673"},
+ {file = "cffi-1.16.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a72e8961a86d19bdb45851d8f1f08b041ea37d2bd8d4fd19903bc3083d80c896"},
+ {file = "cffi-1.16.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5b50bf3f55561dac5438f8e70bfcdfd74543fd60df5fa5f62d94e5867deca684"},
+ {file = "cffi-1.16.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7651c50c8c5ef7bdb41108b7b8c5a83013bfaa8a935590c5d74627c047a583c7"},
+ {file = "cffi-1.16.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e4108df7fe9b707191e55f33efbcb2d81928e10cea45527879a4749cbe472614"},
+ {file = "cffi-1.16.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:32c68ef735dbe5857c810328cb2481e24722a59a2003018885514d4c09af9743"},
+ {file = "cffi-1.16.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:673739cb539f8cdaa07d92d02efa93c9ccf87e345b9a0b556e3ecc666718468d"},
+ {file = "cffi-1.16.0-cp310-cp310-win32.whl", hash = "sha256:9f90389693731ff1f659e55c7d1640e2ec43ff725cc61b04b2f9c6d8d017df6a"},
+ {file = "cffi-1.16.0-cp310-cp310-win_amd64.whl", hash = "sha256:e6024675e67af929088fda399b2094574609396b1decb609c55fa58b028a32a1"},
+ {file = "cffi-1.16.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b84834d0cf97e7d27dd5b7f3aca7b6e9263c56308ab9dc8aae9784abb774d404"},
+ {file = "cffi-1.16.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1b8ebc27c014c59692bb2664c7d13ce7a6e9a629be20e54e7271fa696ff2b417"},
+ {file = "cffi-1.16.0-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ee07e47c12890ef248766a6e55bd38ebfb2bb8edd4142d56db91b21ea68b7627"},
+ {file = "cffi-1.16.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d8a9d3ebe49f084ad71f9269834ceccbf398253c9fac910c4fd7053ff1386936"},
+ {file = "cffi-1.16.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e70f54f1796669ef691ca07d046cd81a29cb4deb1e5f942003f401c0c4a2695d"},
+ {file = "cffi-1.16.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5bf44d66cdf9e893637896c7faa22298baebcd18d1ddb6d2626a6e39793a1d56"},
+ {file = "cffi-1.16.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7b78010e7b97fef4bee1e896df8a4bbb6712b7f05b7ef630f9d1da00f6444d2e"},
+ {file = "cffi-1.16.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:c6a164aa47843fb1b01e941d385aab7215563bb8816d80ff3a363a9f8448a8dc"},
+ {file = "cffi-1.16.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e09f3ff613345df5e8c3667da1d918f9149bd623cd9070c983c013792a9a62eb"},
+ {file = "cffi-1.16.0-cp311-cp311-win32.whl", hash = "sha256:2c56b361916f390cd758a57f2e16233eb4f64bcbeee88a4881ea90fca14dc6ab"},
+ {file = "cffi-1.16.0-cp311-cp311-win_amd64.whl", hash = "sha256:db8e577c19c0fda0beb7e0d4e09e0ba74b1e4c092e0e40bfa12fe05b6f6d75ba"},
+ {file = "cffi-1.16.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:fa3a0128b152627161ce47201262d3140edb5a5c3da88d73a1b790a959126956"},
+ {file = "cffi-1.16.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:68e7c44931cc171c54ccb702482e9fc723192e88d25a0e133edd7aff8fcd1f6e"},
+ {file = "cffi-1.16.0-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:abd808f9c129ba2beda4cfc53bde801e5bcf9d6e0f22f095e45327c038bfe68e"},
+ {file = "cffi-1.16.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:88e2b3c14bdb32e440be531ade29d3c50a1a59cd4e51b1dd8b0865c54ea5d2e2"},
+ {file = "cffi-1.16.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:fcc8eb6d5902bb1cf6dc4f187ee3ea80a1eba0a89aba40a5cb20a5087d961357"},
+ {file = "cffi-1.16.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b7be2d771cdba2942e13215c4e340bfd76398e9227ad10402a8767ab1865d2e6"},
+ {file = "cffi-1.16.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e715596e683d2ce000574bae5d07bd522c781a822866c20495e52520564f0969"},
+ {file = "cffi-1.16.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:2d92b25dbf6cae33f65005baf472d2c245c050b1ce709cc4588cdcdd5495b520"},
+ {file = "cffi-1.16.0-cp312-cp312-win32.whl", hash = "sha256:b2ca4e77f9f47c55c194982e10f058db063937845bb2b7a86c84a6cfe0aefa8b"},
+ {file = "cffi-1.16.0-cp312-cp312-win_amd64.whl", hash = "sha256:68678abf380b42ce21a5f2abde8efee05c114c2fdb2e9eef2efdb0257fba1235"},
+ {file = "cffi-1.16.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:0c9ef6ff37e974b73c25eecc13952c55bceed9112be2d9d938ded8e856138bcc"},
+ {file = "cffi-1.16.0-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a09582f178759ee8128d9270cd1344154fd473bb77d94ce0aeb2a93ebf0feaf0"},
+ {file = "cffi-1.16.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e760191dd42581e023a68b758769e2da259b5d52e3103c6060ddc02c9edb8d7b"},
+ {file = "cffi-1.16.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:80876338e19c951fdfed6198e70bc88f1c9758b94578d5a7c4c91a87af3cf31c"},
+ {file = "cffi-1.16.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a6a14b17d7e17fa0d207ac08642c8820f84f25ce17a442fd15e27ea18d67c59b"},
+ {file = "cffi-1.16.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6602bc8dc6f3a9e02b6c22c4fc1e47aa50f8f8e6d3f78a5e16ac33ef5fefa324"},
+ {file = "cffi-1.16.0-cp38-cp38-win32.whl", hash = "sha256:131fd094d1065b19540c3d72594260f118b231090295d8c34e19a7bbcf2e860a"},
+ {file = "cffi-1.16.0-cp38-cp38-win_amd64.whl", hash = "sha256:31d13b0f99e0836b7ff893d37af07366ebc90b678b6664c955b54561fc36ef36"},
+ {file = "cffi-1.16.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:582215a0e9adbe0e379761260553ba11c58943e4bbe9c36430c4ca6ac74b15ed"},
+ {file = "cffi-1.16.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b29ebffcf550f9da55bec9e02ad430c992a87e5f512cd63388abb76f1036d8d2"},
+ {file = "cffi-1.16.0-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dc9b18bf40cc75f66f40a7379f6a9513244fe33c0e8aa72e2d56b0196a7ef872"},
+ {file = "cffi-1.16.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9cb4a35b3642fc5c005a6755a5d17c6c8b6bcb6981baf81cea8bfbc8903e8ba8"},
+ {file = "cffi-1.16.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b86851a328eedc692acf81fb05444bdf1891747c25af7529e39ddafaf68a4f3f"},
+ {file = "cffi-1.16.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c0f31130ebc2d37cdd8e44605fb5fa7ad59049298b3f745c74fa74c62fbfcfc4"},
+ {file = "cffi-1.16.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f8e709127c6c77446a8c0a8c8bf3c8ee706a06cd44b1e827c3e6a2ee6b8c098"},
+ {file = "cffi-1.16.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:748dcd1e3d3d7cd5443ef03ce8685043294ad6bd7c02a38d1bd367cfd968e000"},
+ {file = "cffi-1.16.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:8895613bcc094d4a1b2dbe179d88d7fb4a15cee43c052e8885783fac397d91fe"},
+ {file = "cffi-1.16.0-cp39-cp39-win32.whl", hash = "sha256:ed86a35631f7bfbb28e108dd96773b9d5a6ce4811cf6ea468bb6a359b256b1e4"},
+ {file = "cffi-1.16.0-cp39-cp39-win_amd64.whl", hash = "sha256:3686dffb02459559c74dd3d81748269ffb0eb027c39a6fc99502de37d501faa8"},
+ {file = "cffi-1.16.0.tar.gz", hash = "sha256:bcb3ef43e58665bbda2fb198698fcae6776483e0c4a631aa5647806c25e02cc0"},
+]
+
+[package.dependencies]
+pycparser = "*"
+
[[package]]
name = "cfgv"
version = "3.4.0"
{file = "distlib-0.3.8.tar.gz", hash = "sha256:1530ea13e350031b6312d8580ddb6b27a104275a31106523b8f123787f494f64"},
]
+[[package]]
+name = "exceptiongroup"
+version = "1.2.1"
+description = "Backport of PEP 654 (exception groups)"
+optional = false
+python-versions = ">=3.7"
+files = [
+ {file = "exceptiongroup-1.2.1-py3-none-any.whl", hash = "sha256:5258b9ed329c5bbdd31a309f53cbfb0b155341807f6ff7606a1e801a891b29ad"},
+ {file = "exceptiongroup-1.2.1.tar.gz", hash = "sha256:a4785e48b045528f5bfe627b6ad554ff32def154f42372786903b7abcfe1aa16"},
+]
+
+[package.extras]
+test = ["pytest (>=6)"]
+
[[package]]
name = "filelock"
version = "3.14.0"
doc = ["sphinx (==4.3.2)", "sphinx-autodoc-typehints", "sphinx-rtd-theme", "sphinxcontrib-applehelp (>=1.0.2,<=1.0.4)", "sphinxcontrib-devhelp (==1.0.2)", "sphinxcontrib-htmlhelp (>=2.0.0,<=2.0.1)", "sphinxcontrib-qthelp (==1.0.3)", "sphinxcontrib-serializinghtml (==1.1.5)"]
test = ["coverage[toml]", "ddt (>=1.1.1,!=1.4.3)", "mock", "mypy", "pre-commit", "pytest (>=7.3.1)", "pytest-cov", "pytest-instafail", "pytest-mock", "pytest-sugar", "typing-extensions"]
+[[package]]
+name = "h11"
+version = "0.14.0"
+description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1"
+optional = false
+python-versions = ">=3.7"
+files = [
+ {file = "h11-0.14.0-py3-none-any.whl", hash = "sha256:e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761"},
+ {file = "h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d"},
+]
+
+[[package]]
+name = "httpcore"
+version = "1.0.5"
+description = "A minimal low-level HTTP client."
+optional = false
+python-versions = ">=3.8"
+files = [
+ {file = "httpcore-1.0.5-py3-none-any.whl", hash = "sha256:421f18bac248b25d310f3cacd198d55b8e6125c107797b609ff9b7a6ba7991b5"},
+ {file = "httpcore-1.0.5.tar.gz", hash = "sha256:34a38e2f9291467ee3b44e89dd52615370e152954ba21721378a87b2960f7a61"},
+]
+
+[package.dependencies]
+certifi = "*"
+h11 = ">=0.13,<0.15"
+
+[package.extras]
+asyncio = ["anyio (>=4.0,<5.0)"]
+http2 = ["h2 (>=3,<5)"]
+socks = ["socksio (==1.*)"]
+trio = ["trio (>=0.22.0,<0.26.0)"]
+
+[[package]]
+name = "httpx"
+version = "0.27.0"
+description = "The next generation HTTP client."
+optional = false
+python-versions = ">=3.8"
+files = [
+ {file = "httpx-0.27.0-py3-none-any.whl", hash = "sha256:71d5465162c13681bff01ad59b2cc68dd838ea1f10e51574bac27103f00c91a5"},
+ {file = "httpx-0.27.0.tar.gz", hash = "sha256:a0cb88a46f32dc874e04ee956e4c2764aba2aa228f650b06788ba6bda2962ab5"},
+]
+
+[package.dependencies]
+anyio = "*"
+certifi = "*"
+httpcore = "==1.*"
+idna = "*"
+sniffio = "*"
+
+[package.extras]
+brotli = ["brotli", "brotlicffi"]
+cli = ["click (==8.*)", "pygments (==2.*)", "rich (>=10,<14)"]
+http2 = ["h2 (>=3,<5)"]
+socks = ["socksio (==1.*)"]
+
[[package]]
name = "identify"
version = "2.5.36"
{file = "numpy-1.26.4.tar.gz", hash = "sha256:2a02aba9ed12e4ac4eb3ea9421c420301a0c6460d9830d74a9df87efa4912010"},
]
+[[package]]
+name = "outcome"
+version = "1.3.0.post0"
+description = "Capture the outcome of Python function calls."
+optional = false
+python-versions = ">=3.7"
+files = [
+ {file = "outcome-1.3.0.post0-py2.py3-none-any.whl", hash = "sha256:e771c5ce06d1415e356078d3bdd68523f284b4ce5419828922b6871e65eda82b"},
+ {file = "outcome-1.3.0.post0.tar.gz", hash = "sha256:9dcf02e65f2971b80047b377468e72a268e15c0af3cf1238e6ff14f7f91143b8"},
+]
+
+[package.dependencies]
+attrs = ">=19.2.0"
+
[[package]]
name = "packaging"
version = "23.2"
{file = "pycodestyle-2.11.1.tar.gz", hash = "sha256:41ba0e7afc9752dfb53ced5489e89f8186be00e599e712660695b7a75ff2663f"},
]
+[[package]]
+name = "pycparser"
+version = "2.22"
+description = "C parser in Python"
+optional = false
+python-versions = ">=3.8"
+files = [
+ {file = "pycparser-2.22-py3-none-any.whl", hash = "sha256:c3702b6d3dd8c7abc1afa565d7e63d53a1d0bd86cdc24edd75470f4de499cfcc"},
+ {file = "pycparser-2.22.tar.gz", hash = "sha256:491c8be9c040f5390f5bf44a5b07752bd07f56edf992381b05c701439eec10f6"},
+]
+
[[package]]
name = "pydeck"
version = "0.9.1"
{file = "smmap-5.0.1.tar.gz", hash = "sha256:dceeb6c0028fdb6734471eb07c0cd2aae706ccaecab45965ee83f11c8d3b1f62"},
]
+[[package]]
+name = "sniffio"
+version = "1.3.1"
+description = "Sniff out which async library your code is running under"
+optional = false
+python-versions = ">=3.7"
+files = [
+ {file = "sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2"},
+ {file = "sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc"},
+]
+
+[[package]]
+name = "sortedcontainers"
+version = "2.4.0"
+description = "Sorted Containers -- Sorted List, Sorted Dict, Sorted Set"
+optional = false
+python-versions = "*"
+files = [
+ {file = "sortedcontainers-2.4.0-py2.py3-none-any.whl", hash = "sha256:a163dcaede0f1c021485e957a39245190e74249897e2ae4b2aa38595db237ee0"},
+ {file = "sortedcontainers-2.4.0.tar.gz", hash = "sha256:25caa5a06cc30b6b83d11423433f65d1f9d76c4c6a0c90e3379eaa43b9bfdb88"},
+]
+
[[package]]
name = "streamlit"
version = "1.27.0"
{file = "tornado-6.4.tar.gz", hash = "sha256:72291fa6e6bc84e626589f1c29d90a5a6d593ef5ae68052ee2ef000dfd273dee"},
]
+[[package]]
+name = "trio"
+version = "0.25.1"
+description = "A friendly Python library for async concurrency and I/O"
+optional = false
+python-versions = ">=3.8"
+files = [
+ {file = "trio-0.25.1-py3-none-any.whl", hash = "sha256:e42617ba091e7b2e50c899052e83a3c403101841de925187f61e7b7eaebdf3fb"},
+ {file = "trio-0.25.1.tar.gz", hash = "sha256:9f5314f014ea3af489e77b001861c535005c3858d38ec46b6b071ebfa339d7fb"},
+]
+
+[package.dependencies]
+attrs = ">=23.2.0"
+cffi = {version = ">=1.14", markers = "os_name == \"nt\" and implementation_name != \"pypy\""}
+exceptiongroup = {version = "*", markers = "python_version < \"3.11\""}
+idna = "*"
+outcome = "*"
+sniffio = ">=1.3.0"
+sortedcontainers = "*"
+
[[package]]
name = "typing-extensions"
version = "4.12.1"
[metadata]
lock-version = "2.0"
python-versions = ">=3.9,<3.9.7 || >3.9.7,<4.0"
-content-hash = "6dbea10017dac018837628b065985efe6b4ee98c841beb7076dacaeabec2646e"
+content-hash = "b0a265c676a14faabce90d2227ef14170b89a647f70e7c7c0f381319b0d9840f"
requests = "^2.30.0"
streamlit = "1.27.0"
rich = "^13.6.0"
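+# httpx supplies the async HTTP client and trio the event loop used by the async retrieval flow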
+httpx = "^0.27.0"
+trio = "^0.25.1"
[tool.poetry.group.docs.dependencies]
mkdocs = "^1.6.0"
import re
import pandas as pd
+from rich import print as rprint
from viz_tweets import HTMLTweetsVisualizer
csv_file_path = f"{self.filename}.csv"
self.dataframe.to_csv(csv_file_path, index=False)
- print(f"Saved to {csv_file_path}")
+ rprint(f"[blue]Saved to {csv_file_path}")
def save_to_json(self):
"""Saves the DataFrame to a JSON file."""
json_file_path = f"{self.filename}.json"
self.dataframe.to_json(json_file_path, orient="records", lines=False)
- print(f"Saved to {json_file_path}")
+ rprint(f"[blue]Saved to {json_file_path}")
def save_to_html(self):
"""Saves the DataFrame to an HTML file."""
html_content = html.generate()
html.save(html_content)
- print(f"Saved to {html_file_path}")
+ rprint(f"[blue]Saved to {html_file_path}")
Main function for retrieving archived tweets.
"""
+import trio
from export_tweets import TweetsExporter
from parse_tweets import TweetsParser
from request_tweets import WaybackTweets
from rich import print as rprint
username = "claromes"
-unique = False
-datetime_from = "2020-01-01"
-datetime_to = "2024-05-31"
+unique = True
+datetime_from = None
+datetime_to = None
ascending = False
-def main():
+async def main():
"""
Invokes the classes to retrieve archived tweets, perform necessary parsing,
and save the data.
"""
try:
api = WaybackTweets(username, unique, datetime_from, datetime_to)
- archived_tweets = api.get()
+ archived_tweets = await api.get()
if archived_tweets:
metadata_options = [
exporter.save_to_csv()
exporter.save_to_json()
exporter.save_to_html()
- else:
- print("Nothing here.")
except TypeError as e:
print(e)
if __name__ == "__main__":
- main()
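+ # trio.run drives the async entry point; httpx's AsyncClient supports the Trio event loop via anyio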
+ trio.run(main)
import re
+from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import unquote
-import requests
-from rich.progress import track
+import httpx
+from rich import print as rprint
+from rich.progress import Progress
from utils import (
check_double_status,
check_pattern_tweet,
"""Parses the archived tweets when they are still available."""
try:
url = f"https://publish.twitter.com/oembed?url={self.tweet_url}"
- response = requests.get(url)
+ response = httpx.get(url)
if not (400 <= response.status_code <= 511):
- html = response.json()["html"]
- author_name = response.json()["author_name"]
+ json_response = response.json()
+ html = json_response["html"]
+ author_name = json_response["author_name"]
- regex = r'<blockquote class="twitter-tweet"(?: [^>]+)?><p[^>]*>(.*?)<\/p>.*?— (.*?)<\/a>' # noqa: E501
- regex_author = r"^(.*?)\s*\("
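+ # Compiled once with re.DOTALL so "." also matches newlines inside the embedded blockquote HTML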
+ regex = re.compile(
+ r'<blockquote class="twitter-tweet"(?: [^>]+)?><p[^>]*>(.*?)<\/p>.*?— (.*?)<\/a>', # noqa
+ re.DOTALL,
+ )
+ regex_author = re.compile(r"^(.*?)\s*\(")
- matches_html = re.findall(regex, html, re.DOTALL)
+ matches_html = regex.findall(html)
tweet_content = []
user_info = []
for match in matches_html:
tweet_content_match = re.sub(
r"<a[^>]*>|<\/a>", "", match[0].strip()
- )
- tweet_content_match = tweet_content_match.replace("<br>", "\n")
-
- user_info_match = re.sub(r"<a[^>]*>|<\/a>", "", match[1].strip())
- user_info_match = user_info_match.replace(")", "), ")
-
- match_author = re.search(regex_author, user_info_match)
+ ).replace("<br>", "\n")
+ user_info_match = re.sub(
+ r"<a[^>]*>|<\/a>", "", match[1].strip()
+ ).replace(")", "), ")
+ match_author = regex_author.search(user_info_match)
author_tweet = match_author.group(1) if match_author else ""
if tweet_content_match:
tweet_content.append(tweet_content_match)
if user_info_match:
user_info.append(user_info_match)
-
- is_RT_match = False
- if author_name != author_tweet:
- is_RT_match = True
-
- is_RT.append(is_RT_match)
+ is_RT.append(author_name != author_tweet)
return tweet_content, is_RT, user_info
- except Exception as e:
- print(f"Error parsing tweet: {e}")
+ except Exception:
+ rprint("[yellow]Error parsing the tweet, but the metadata was saved.")
return None
def parse(self):
"""Parses the archived tweets in JSON format."""
try:
- response = requests.get(self.archived_tweet_url)
- if not (400 <= response.status_code <= 511):
+ response = httpx.get(self.archived_tweet_url)
+
+ if response and not (400 <= response.status_code <= 511):
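+ # The Wayback snapshot is served as JSON; prefer the "text" field when the payload provides one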
json_data = response.json()
if "data" in json_data:
return json_data["data"].get("text", json_data["data"])
- elif "retweeted_status" in json_data:
+
+ if "retweeted_status" in json_data:
return json_data["retweeted_status"].get(
"text", json_data["retweeted_status"]
)
- else:
- return json_data.get("text", json_data)
- except Exception as e:
- print(f"Error parsing JSON mimetype tweet: {e}")
- return None
+
+ return json_data.get("text", json_data)
+ except Exception:
+ rprint(
+ f"[yellow]Connection error with {self.archived_tweet_url}. Error parsing the JSON, but the metadata was saved." # noqa: E501
+ )
+
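+ # Return an empty string so the caller's semicolon_parser still receives text rather than None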
+ return ""
class TweetsParser:
if key in self.parsed_tweets:
self.parsed_tweets[key].append(value)
- def parse(self):
- """Parses the archived tweets metadata and structures it."""
- for response in track(
- self.archived_tweets_response[1:],
- description=f"Wayback @{self.username} tweets\n",
- ):
- tweet_remove_char = unquote(response[2]).replace("’", "")
- cleaned_tweet = check_pattern_tweet(tweet_remove_char).strip('"')
-
- wayback_machine_url = (
- f"https://web.archive.org/web/{response[1]}/{tweet_remove_char}"
- )
+ def process_response(self, response):
+ """Process the archived tweet's response and add the relevant metadata."""
+ tweet_remove_char = unquote(response[2]).replace("’", "")
+ cleaned_tweet = check_pattern_tweet(tweet_remove_char).strip('"')
+
+ wayback_machine_url = (
+ f"https://web.archive.org/web/{response[1]}/{tweet_remove_char}"
+ )
+ original_tweet = delete_tweet_pathnames(
+ clean_tweet_url(cleaned_tweet, self.username)
+ )
+ parsed_wayback_machine_url = (
+ f"https://web.archive.org/web/{response[1]}/{original_tweet}"
+ )
+
+ double_status = check_double_status(wayback_machine_url, original_tweet)
+
+ if double_status:
original_tweet = delete_tweet_pathnames(
- clean_tweet_url(cleaned_tweet, self.username)
+ f"https://twitter.com/{original_tweet}"
)
- parsed_wayback_machine_url = (
- f"https://web.archive.org/web/{response[1]}/{original_tweet}"
- )
-
- double_status = check_double_status(wayback_machine_url, original_tweet)
+ elif "://" not in original_tweet:
+ original_tweet = delete_tweet_pathnames(f"https://{original_tweet}")
- if double_status:
- original_tweet = delete_tweet_pathnames(
- f"https://twitter.com/{original_tweet}"
- )
- elif "://" not in original_tweet:
- original_tweet = delete_tweet_pathnames(f"https://{original_tweet}")
+ encoded_tweet = semicolon_parser(response[2])
+ encoded_archived_tweet = semicolon_parser(wayback_machine_url)
+ encoded_parsed_tweet = semicolon_parser(original_tweet)
+ encoded_parsed_archived_tweet = semicolon_parser(parsed_wayback_machine_url)
- encoded_tweet = semicolon_parser(response[2])
- encoded_archived_tweet = semicolon_parser(wayback_machine_url)
- encoded_parsed_tweet = semicolon_parser(original_tweet)
- encoded_parsed_archived_tweet = semicolon_parser(parsed_wayback_machine_url)
+ embed_parser = TwitterEmbed(encoded_tweet)
+ content = embed_parser.embed()
- embed_parser = TwitterEmbed(encoded_tweet)
- content = embed_parser.embed()
+ if content:
+ self.add_metadata("available_tweet_text", semicolon_parser(content[0][0]))
+ self.add_metadata("available_tweet_is_RT", content[1][0])
+ self.add_metadata(
+ "available_tweet_username", semicolon_parser(content[2][0])
+ )
- if content:
- self.add_metadata(
- "available_tweet_text", semicolon_parser(content[0][0])
- )
- self.add_metadata("available_tweet_is_RT", content[1][0])
- self.add_metadata(
- "available_tweet_username", semicolon_parser(content[2][0])
- )
+ parsed_text_json = ""
- if response[3] == "application/json":
- json_parser = JsonParser(encoded_archived_tweet)
+ if response[3] == "application/json":
+ json_parser = JsonParser(encoded_archived_tweet)
+ if json_parser:
text_json = json_parser.parse()
parsed_text_json = semicolon_parser(text_json)
- else:
- parsed_text_json = None
- self.add_metadata("parsed_tweet_text_mimetype_json", parsed_text_json)
+ self.add_metadata("parsed_tweet_text_mimetype_json", parsed_text_json)
+ self.add_metadata("archived_urlkey", response[0])
+ self.add_metadata("archived_timestamp", response[1])
+ self.add_metadata("original_tweet_url", encoded_tweet)
+ self.add_metadata("archived_tweet_url", encoded_archived_tweet)
+ self.add_metadata("parsed_tweet_url", encoded_parsed_tweet)
+ self.add_metadata("parsed_archived_tweet_url", encoded_parsed_archived_tweet)
+ self.add_metadata("archived_mimetype", response[3])
+ self.add_metadata("archived_statuscode", response[4])
+ self.add_metadata("archived_digest", response[5])
+ self.add_metadata("archived_length", response[6])
- self.add_metadata("archived_urlkey", response[0])
- self.add_metadata("archived_timestamp", response[1])
- self.add_metadata("original_tweet_url", encoded_tweet)
- self.add_metadata("archived_tweet_url", encoded_archived_tweet)
- self.add_metadata("parsed_tweet_url", encoded_parsed_tweet)
- self.add_metadata(
- "parsed_archived_tweet_url", encoded_parsed_archived_tweet
- )
- self.add_metadata("archived_mimetype", response[3])
- self.add_metadata("archived_statuscode", response[4])
- self.add_metadata("archived_digest", response[5])
- self.add_metadata("archived_length", response[6])
+ def parse(self):
+ """Parses the archived tweets metadata and structures it."""
+ with ThreadPoolExecutor(max_workers=10) as executor:
+
+ futures = {
+ executor.submit(self.process_response, response): response
+ for response in self.archived_tweets_response[1:]
+ }
+ with Progress() as progress:
+ task = progress.add_task(
+ f"Waybacking @{self.username} tweets\n", total=len(futures)
+ )
+
+ for future in as_completed(futures):
+ try:
+ future.result()
+ except httpx.RequestError as e:
+ rprint(f"[red]{e}")
+ except Exception as e:
+ rprint(f"[red]{e}")
+
+ progress.update(task, advance=1)
- return self.parsed_tweets
+ return self.parsed_tweets
-import requests
+import httpx
+from rich import print as rprint
class WaybackTweets:
"""Requests data from the Wayback CDX Server API and returns it in JSON format."""
- def __init__(self, username, unique=False, timestamp_from="", timestamp_to=""):
+ def __init__(self, username, unique=False, timestamp_from=None, timestamp_to=None):
self.username = username
self.unique = unique
self.timestamp_from = timestamp_from
self.timestamp_to = timestamp_to
- def get(self):
- unique_param = "&collapse=urlkey" if self.unique else ""
- timestamp_from_param = (
- f"&from={self.timestamp_from}" if self.timestamp_from else ""
- )
- timestamp_to_param = f"&to={self.timestamp_to}" if self.timestamp_to else ""
-
- url = (
- f"https://web.archive.org/cdx/search/cdx?url=https://twitter.com/{self.username}/status/*" # noqa: E501
- f"&output=json{unique_param}{timestamp_from_param}{timestamp_to_param}&limit=20" # noqa: E501
- )
+ async def get(self):
+ """GET request to the Internet Archive's CDX API to retrieve archived tweets."""
+ url = "https://web.archive.org/cdx/search/cdx"
+ params = {
+ "url": f"https://twitter.com/{self.username}/status/*",
+ "output": "json",
+ "limit": 1000,
+ }
+
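+ # Optional CDX filters: collapse=urlkey deduplicates captures of the same URL, from/to bound the timestamp range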
+ if self.unique:
+ params["collapse"] = "urlkey"
+
+ if self.timestamp_from:
+ params["from"] = self.timestamp_from
+
+ if self.timestamp_to:
+ params["to"] = self.timestamp_to
+
print("Hi, archivist...")
try:
- response = requests.get(url)
- response.raise_for_status()
+ async with httpx.AsyncClient() as client:
+ response = await client.get(url, params=params)
if not (400 <= response.status_code <= 511):
return response.json()
- except requests.exceptions.Timeout as e:
- print(f"{e}.\nConnection to web.archive.org timed out.")
- except requests.exceptions.ConnectionError as e:
- print(f"{e}.\nFailed to establish a new connection with web.archive.org.")
- except requests.exceptions.HTTPError as e:
- print(
- f"{e}.\nTemporarily Offline: Internet Archive services are temporarily offline. Please check Internet Archive [Twitter feed](https://twitter.com/internetarchive/) for the latest information." # noqa: E501
+ except httpx.ReadTimeout:
+ rprint("[red]Connection to web.archive.org timed out.")
+ except httpx.ConnectError:
+ rprint("[red]Failed to establish a new connection with web.archive.org.")
+ except httpx.HTTPError:
+ rprint(
+ "[red]Temporarily Offline: Internet Archive services are temporarily offline. Please check Internet Archive Twitter feed (https://twitter.com/internetarchive) for the latest information." # noqa: E501
)
except UnboundLocalError as e:
print(e)
- return None