From 61a37aab261c5738d96b5bad61332997084df531 Mon Sep 17 00:00:00 2001 From: Claromes Date: Fri, 14 Jun 2024 16:39:23 -0300 Subject: [PATCH] comment JsonParser --- app/app.py | 6 ++ docs/api.rst | 5 +- docs/errors.rst | 5 +- docs/result.rst | 3 +- poetry.lock | 105 +--------------------------- pyproject.toml | 1 - waybacktweets/api/parse_tweets.py | 26 ++++--- waybacktweets/api/request_tweets.py | 2 + waybacktweets/api/viz_tweets.py | 36 ++++++---- waybacktweets/cli/main.py | 2 +- waybacktweets/utils/utils.py | 6 +- 11 files changed, 61 insertions(+), 136 deletions(-) diff --git a/app/app.py b/app/app.py index c4a7ab7..7a38213 100644 --- a/app/app.py +++ b/app/app.py @@ -126,6 +126,9 @@ def tweets_count(username, archived_timestamp_filter): except requests.exceptions.ConnectionError: st.error("Failed to establish a new connection with web.archive.org.") st.stop() + except Exception as e: + st.error(f"{e}") + st.stop() # Interface Settings @@ -437,3 +440,6 @@ if query or st.session_state.count: If the problem persists [open an issue](https://github.com/claromes/waybacktweets/issues).""" # noqa: E501 ) st.session_state.offset = 0 + except Exception as e: + st.error(f"{e}") + st.stop() diff --git a/docs/api.rst b/docs/api.rst index 283e429..ced6bf4 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -22,8 +22,9 @@ Parse .. autoclass:: TwitterEmbed :members: -.. autoclass:: JsonParser - :members: +.. TODO: JSON Issue +.. .. autoclass:: JsonParser +.. :members: Export diff --git a/docs/errors.rst b/docs/errors.rst index 492279b..d12c436 100644 --- a/docs/errors.rst +++ b/docs/errors.rst @@ -17,9 +17,10 @@ This error is raised when the package fails to establish a new connection with w The output message from the package would be: ``Failed to establish a new connection with web.archive.org. Max retries exceeded.`` -This is the error often returned when performing experimental parsing of URLs with the mimetype ``application/json``. +.. TODO: JSON Issue +.. This is the error often returned when performing experimental parsing of URLs with the mimetype ``application/json``. -The warning output message from the package would be: ``Connection error with https://web.archive.org/web//https://twitter.com//status/. Max retries exceeded. Error parsing the JSON, but the CDX data was saved.`` +.. The warning output message from the package would be: ``Connection error with https://web.archive.org/web//https://twitter.com//status/. Max retries exceeded. Error parsing the JSON, but the CDX data was saved.`` HTTPError ---------------- diff --git a/docs/result.rst b/docs/result.rst index 2794f40..38b5498 100644 --- a/docs/result.rst +++ b/docs/result.rst @@ -15,7 +15,8 @@ The package saves in three formats: CSV, JSON, and HTML. The files have the foll - ``parsed_archived_tweet_url``: (`str`) The original archived URL after parsing. `Check the utility functions `_. -- ``parsed_tweet_text_mimetype_json``: (`str`) The tweet text extracted from the archived URL that has mimetype ``application/json``. +.. TODO: JSON Issue +.. - ``parsed_tweet_text_mimetype_json``: (`str`) The tweet text extracted from the archived URL that has mimetype ``application/json``. - ``available_tweet_text``: (`str`) The tweet text extracted from the URL that is still available on the Twitter account. 
diff --git a/poetry.lock b/poetry.lock index 12756fd..8346990 100644 --- a/poetry.lock +++ b/poetry.lock @@ -36,28 +36,6 @@ all = ["altair-tiles (>=0.3.0)", "anywidget (>=0.9.0)", "pyarrow (>=11)", "vega- dev = ["geopandas", "hatch", "ipython", "m2r", "mypy", "pandas-stubs", "pytest", "pytest-cov", "ruff (>=0.3.0)", "types-jsonschema", "types-setuptools"] doc = ["docutils", "jinja2", "myst-parser", "numpydoc", "pillow (>=9,<10)", "pydata-sphinx-theme (>=0.14.1)", "scipy", "sphinx", "sphinx-copybutton", "sphinx-design", "sphinxext-altair"] -[[package]] -name = "anyio" -version = "4.4.0" -description = "High level compatibility layer for multiple asynchronous event loop implementations" -optional = false -python-versions = ">=3.8" -files = [ - {file = "anyio-4.4.0-py3-none-any.whl", hash = "sha256:c1b2d8f46a8a812513012e1107cb0e68c17159a7a594208005a57dc776e1bdc7"}, - {file = "anyio-4.4.0.tar.gz", hash = "sha256:5aadc6a1bbb7cdb0bede386cac5e2940f5e2ff3aa20277e991cf028e0585ce94"}, -] - -[package.dependencies] -exceptiongroup = {version = ">=1.0.2", markers = "python_version < \"3.11\""} -idna = ">=2.8" -sniffio = ">=1.1" -typing-extensions = {version = ">=4.1", markers = "python_version < \"3.11\""} - -[package.extras] -doc = ["Sphinx (>=7)", "packaging", "sphinx-autodoc-typehints (>=1.2.0)", "sphinx-rtd-theme"] -test = ["anyio[trio]", "coverage[toml] (>=7)", "exceptiongroup (>=1.2.0)", "hypothesis (>=4.0)", "psutil (>=5.9)", "pytest (>=7.0)", "pytest-mock (>=3.6.1)", "trustme", "uvloop (>=0.17)"] -trio = ["trio (>=0.23)"] - [[package]] name = "attrs" version = "23.2.0" @@ -327,20 +305,6 @@ files = [ {file = "docutils-0.21.2.tar.gz", hash = "sha256:3a6b18732edf182daa3cd12775bbb338cf5691468f91eeeb109deff6ebfa986f"}, ] -[[package]] -name = "exceptiongroup" -version = "1.2.1" -description = "Backport of PEP 654 (exception groups)" -optional = false -python-versions = ">=3.7" -files = [ - {file = "exceptiongroup-1.2.1-py3-none-any.whl", hash = "sha256:5258b9ed329c5bbdd31a309f53cbfb0b155341807f6ff7606a1e801a891b29ad"}, - {file = "exceptiongroup-1.2.1.tar.gz", hash = "sha256:a4785e48b045528f5bfe627b6ad554ff32def154f42372786903b7abcfe1aa16"}, -] - -[package.extras] -test = ["pytest (>=6)"] - [[package]] name = "filelock" version = "3.15.1" @@ -422,62 +386,6 @@ gitdb = ">=4.0.1,<5" doc = ["sphinx (==4.3.2)", "sphinx-autodoc-typehints", "sphinx-rtd-theme", "sphinxcontrib-applehelp (>=1.0.2,<=1.0.4)", "sphinxcontrib-devhelp (==1.0.2)", "sphinxcontrib-htmlhelp (>=2.0.0,<=2.0.1)", "sphinxcontrib-qthelp (==1.0.3)", "sphinxcontrib-serializinghtml (==1.1.5)"] test = ["coverage[toml]", "ddt (>=1.1.1,!=1.4.3)", "mock", "mypy", "pre-commit", "pytest (>=7.3.1)", "pytest-cov", "pytest-instafail", "pytest-mock", "pytest-sugar", "typing-extensions"] -[[package]] -name = "h11" -version = "0.14.0" -description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1" -optional = false -python-versions = ">=3.7" -files = [ - {file = "h11-0.14.0-py3-none-any.whl", hash = "sha256:e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761"}, - {file = "h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d"}, -] - -[[package]] -name = "httpcore" -version = "1.0.5" -description = "A minimal low-level HTTP client." 
-optional = false -python-versions = ">=3.8" -files = [ - {file = "httpcore-1.0.5-py3-none-any.whl", hash = "sha256:421f18bac248b25d310f3cacd198d55b8e6125c107797b609ff9b7a6ba7991b5"}, - {file = "httpcore-1.0.5.tar.gz", hash = "sha256:34a38e2f9291467ee3b44e89dd52615370e152954ba21721378a87b2960f7a61"}, -] - -[package.dependencies] -certifi = "*" -h11 = ">=0.13,<0.15" - -[package.extras] -asyncio = ["anyio (>=4.0,<5.0)"] -http2 = ["h2 (>=3,<5)"] -socks = ["socksio (==1.*)"] -trio = ["trio (>=0.22.0,<0.26.0)"] - -[[package]] -name = "httpx" -version = "0.27.0" -description = "The next generation HTTP client." -optional = false -python-versions = ">=3.8" -files = [ - {file = "httpx-0.27.0-py3-none-any.whl", hash = "sha256:71d5465162c13681bff01ad59b2cc68dd838ea1f10e51574bac27103f00c91a5"}, - {file = "httpx-0.27.0.tar.gz", hash = "sha256:a0cb88a46f32dc874e04ee956e4c2764aba2aa228f650b06788ba6bda2962ab5"}, -] - -[package.dependencies] -anyio = "*" -certifi = "*" -httpcore = "==1.*" -idna = "*" -sniffio = "*" - -[package.extras] -brotli = ["brotli", "brotlicffi"] -cli = ["click (==8.*)", "pygments (==2.*)", "rich (>=10,<14)"] -http2 = ["h2 (>=3,<5)"] -socks = ["socksio (==1.*)"] - [[package]] name = "identify" version = "2.5.36" @@ -1403,17 +1311,6 @@ files = [ {file = "smmap-5.0.1.tar.gz", hash = "sha256:dceeb6c0028fdb6734471eb07c0cd2aae706ccaecab45965ee83f11c8d3b1f62"}, ] -[[package]] -name = "sniffio" -version = "1.3.1" -description = "Sniff out which async library your code is running under" -optional = false -python-versions = ">=3.7" -files = [ - {file = "sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2"}, - {file = "sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc"}, -] - [[package]] name = "snowballstemmer" version = "2.2.0" @@ -1843,4 +1740,4 @@ test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", [metadata] lock-version = "2.0" python-versions = ">=3.9,<3.9.7 || >3.9.7,<4.0" -content-hash = "6b4f6eedd706b20782b173657a9e8936e20853014e2ea504f064765cd42be4f7" +content-hash = "e2870692e02e31ac100b8f245b07118ea693b67898444ea22ab43963b8feb944" diff --git a/pyproject.toml b/pyproject.toml index 7154426..38b4706 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,7 +11,6 @@ python = ">=3.9,<3.9.7 || >3.9.7,<4.0" requests = "^2.30.0" streamlit = "1.35.0" rich = "^13.6.0" -httpx = "^0.27.0" click = "^8.1.7" [tool.poetry.group.docs.dependencies] diff --git a/waybacktweets/api/parse_tweets.py b/waybacktweets/api/parse_tweets.py index bc1704a..465287f 100644 --- a/waybacktweets/api/parse_tweets.py +++ b/waybacktweets/api/parse_tweets.py @@ -65,8 +65,12 @@ class TwitterEmbed: except exceptions: rprint("[yellow]Error parsing the tweet, but the CDX data was saved.") return None + except Exception as e: + rprint(f"[red]{e}") + return None +# TODO: JSON Issue - Create separate function to handle JSON return without hitting rate limiting # noqa: E501 class JsonParser: """Handles parsing of tweets when the mimetype is application/json.""" @@ -99,6 +103,9 @@ class JsonParser: rprint("[yellow]Error parsing the JSON, but the CDX data was saved.") return "" + except Exception as e: + rprint(f"[red]{e}") + return "" class TweetsParser: @@ -155,15 +162,18 @@ class TweetsParser: self._add_field("available_tweet_is_RT", content[1][0]) self._add_field("available_tweet_info", semicolon_parser(content[2][0])) - parsed_text_json = "" + # TODO: JSON Issue + # parsed_text_json 
= ""
+
+        # if response[3] == "application/json":
+        #     json_parser = JsonParser(encoded_parsed_archived_tweet)
+        #     text_json = json_parser.parse()
+
+        #     if text_json:
+        #         parsed_text_json = semicolon_parser(text_json)
 
-        if response[3] == "application/json":
-            json_parser = JsonParser(encoded_parsed_archived_tweet)
-            if json_parser:
-                text_json = json_parser.parse()
-                parsed_text_json = semicolon_parser(text_json)
+        #     self._add_field("parsed_tweet_text_mimetype_json", parsed_text_json)
 
-        self._add_field("parsed_tweet_text_mimetype_json", parsed_text_json)
         self._add_field("archived_urlkey", response[0])
         self._add_field("archived_timestamp", response[1])
         self._add_field("original_tweet_url", encoded_tweet)
@@ -192,7 +202,7 @@ class TweetsParser:
             try:
                 future.result()
             except Exception as e:
-                rprint(f"[red]{e}...")
+                rprint(f"[red]{e}")
 
             progress.update(task, advance=1)
 
diff --git a/waybacktweets/api/request_tweets.py b/waybacktweets/api/request_tweets.py
index 72eaf89..7bb9dd1 100644
--- a/waybacktweets/api/request_tweets.py
+++ b/waybacktweets/api/request_tweets.py
@@ -55,3 +55,5 @@ class WaybackTweets:
             rprint(
                 "[red]Temporarily Offline: Internet Archive services are temporarily offline. Please check Internet Archive Twitter feed (https://twitter.com/internetarchive) for the latest information."  # noqa: E501
             )
+        except Exception as e:
+            rprint(f"[red]{e}")
diff --git a/waybacktweets/api/viz_tweets.py b/waybacktweets/api/viz_tweets.py
index 61b242e..229bf86 100644
--- a/waybacktweets/api/viz_tweets.py
+++ b/waybacktweets/api/viz_tweets.py
@@ -38,17 +38,22 @@ class HTMLTweetsVisualizer:
         for tweet in self.json_content:
            html += '<div class="tweet">\n'
 
+            # TODO: JSON Issue
+            # if (
+            #     (
+            #         tweet["archived_mimetype"] != "application/json"
+            #         and not tweet["parsed_tweet_text_mimetype_json"]
+            #     )
+            #     and not tweet["available_tweet_text"]
+            # ) or (
+            #     (
+            #         tweet["archived_mimetype"] == "application/json"
+            #         and not tweet["parsed_tweet_text_mimetype_json"]
+            #     )
+            #     and not tweet["available_tweet_text"]
+            # ):
             if (
-                (
-                    tweet["archived_mimetype"] != "application/json"
-                    and not tweet["parsed_tweet_text_mimetype_json"]
-                )
-                and not tweet["available_tweet_text"]
-            ) or (
-                (
-                    tweet["archived_mimetype"] == "application/json"
-                    and not tweet["parsed_tweet_text_mimetype_json"]
-                )
+                tweet["archived_mimetype"] != "application/json"
                 and not tweet["available_tweet_text"]
             ):
                 html += f'<iframe src="{tweet["parsed_archived_tweet_url"]}"></iframe>\n'
@@ -64,11 +69,12 @@
                 html += f'<p><strong>Available Tweet Is Retweet:</strong> {tweet["available_tweet_is_RT"]}</p>\n'
                 html += f'<p><strong>Available Tweet Username:</strong> {tweet["available_tweet_info"]}</p>\n'
 
-            if (
-                tweet["archived_mimetype"] == "application/json"
-                and tweet["parsed_tweet_text_mimetype_json"]
-            ) and not tweet["available_tweet_text"]:
-                html += f'<p><strong>Parsed Tweet Text (application/json):</strong> {tweet["parsed_tweet_text_mimetype_json"]}</p>\n'
+            # TODO: JSON Issue
+            # if (
+            #     tweet["archived_mimetype"] == "application/json"
+            #     and tweet["parsed_tweet_text_mimetype_json"]
+            # ) and not tweet["available_tweet_text"]:
+            #     html += f'<p><strong>Parsed Tweet Text (application/json):</strong> {tweet["parsed_tweet_text_mimetype_json"]}</p>\n'
 
             html += "<br>\n"
             html += f'<p><strong>Archived URL Key:</strong> {tweet["archived_urlkey"]}</p>\n'
diff --git a/waybacktweets/cli/main.py b/waybacktweets/cli/main.py
index 7d01d05..421980d 100644
--- a/waybacktweets/cli/main.py
+++ b/waybacktweets/cli/main.py
@@ -75,7 +75,7 @@ def cli(
         "archived_tweet_url",
         "parsed_tweet_url",
         "parsed_archived_tweet_url",
-        "parsed_tweet_text_mimetype_json",
+        # "parsed_tweet_text_mimetype_json",  # TODO: JSON Issue
         "available_tweet_text",
         "available_tweet_is_RT",
         "available_tweet_info",
diff --git a/waybacktweets/utils/utils.py b/waybacktweets/utils/utils.py
index eff374a..cfa29bb 100644
--- a/waybacktweets/utils/utils.py
+++ b/waybacktweets/utils/utils.py
@@ -25,8 +25,10 @@ def get_response(url, params=None):
 
     response = session.get(url, params=params, headers=headers)
 
-    if not 400 <= response.status_code <= 511:
-        return response
+    if 400 <= response.status_code <= 511:
+        return None
+
+    return response
 
 
 def clean_tweet_url(tweet_url, username):
-- 
2.34.1
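
Not part of the patch: the "TODO: JSON Issue" comments above point at handling the
application/json snapshots in a separate function that does not hit the Wayback
Machine rate limit. A minimal sketch of one possible approach follows, assuming a
plain requests call with exponential backoff; the function name, timeout, retry
counts, and HTTP 429 handling are illustrative assumptions, not code from this
repository.

    import time

    import requests


    def fetch_archived_json(archived_url, max_retries=3, backoff=2.0):
        """Hypothetical helper: fetch an archived application/json snapshot,
        backing off and retrying when web.archive.org rate-limits (HTTP 429)
        or the connection fails."""
        for attempt in range(max_retries):
            try:
                response = requests.get(archived_url, timeout=30)
                if response.status_code == 429:
                    # Rate limited: wait, then retry with exponential backoff.
                    time.sleep(backoff * (2**attempt))
                    continue
                response.raise_for_status()
                return response.json()
            except requests.exceptions.RequestException:
                # Connection problems: back off and retry as well.
                time.sleep(backoff * (2**attempt))
        return None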