From: Claromes Date: Mon, 17 Jun 2024 10:08:30 +0000 (-0300) Subject: update docs, update docstrings style X-Git-Url: https://git.claromes.com/?a=commitdiff_plain;h=b70d207ae959d560ab1cedeb14cb1620248a4a84;p=waybacktweets.git update docs, update docstrings style --- diff --git a/docs/api.rst b/docs/api.rst index 7e7ab9f..2cfa7af 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -1,43 +1,14 @@ API ==== -Config ------------- - -.. automodule:: waybacktweets.config.config - :members: - - -Exceptions ------------- - -.. automodule:: waybacktweets.exceptions.exceptions - -.. autoclass:: ReadTimeoutError - :members: - -.. autoclass:: ConnectionError - :members: - -.. autoclass:: HTTPError - :members: - -.. autoclass:: EmptyResponseError - :members: - -.. autoclass:: GetResponseError - :members: - - -Export +Request --------- -.. automodule:: waybacktweets.api.export +.. automodule:: waybacktweets.api.request -.. autoclass:: TweetsExporter +.. autoclass:: WaybackTweets :members: - Parse --------- @@ -45,6 +16,7 @@ Parse .. autoclass:: TweetsParser :members: + :private-members: .. autoclass:: TwitterEmbed :members: @@ -52,15 +24,23 @@ Parse .. autoclass:: JsonParser :members: - -Request +Export --------- -.. automodule:: waybacktweets.api.request +.. automodule:: waybacktweets.api.export -.. autoclass:: WaybackTweets +.. autoclass:: TweetsExporter :members: + :private-members: + +Visualize +----------- + +.. automodule:: waybacktweets.api.visualize +.. autoclass:: HTMLTweetsVisualizer + :members: + :private-members: Utils ------- @@ -76,11 +56,28 @@ Utils .. autofunction:: is_tweet_url .. autofunction:: semicolon_parser +Exceptions +------------ -Visualizer ------------ +.. automodule:: waybacktweets.exceptions.exceptions -.. automodule:: waybacktweets.api.visualize +.. autoclass:: ReadTimeoutError + :members: -.. autoclass:: HTMLTweetsVisualizer +.. autoclass:: ConnectionError + :members: + +.. autoclass:: HTTPError + :members: + +.. 
autoclass:: EmptyResponseError + :members: + +.. autoclass:: GetResponseError + :members: + +Config +------------ + +.. automodule:: waybacktweets.config.config :members: diff --git a/docs/conf.py b/docs/conf.py index 5692fea..599c92f 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -11,16 +11,19 @@ author = "Claromes" extensions = [ "sphinx.ext.autodoc", + "sphinx.ext.napoleon", "sphinx.ext.extlinks", "sphinx.ext.intersphinx", "pallets_sphinx_themes", "sphinxcontrib.mermaid", "sphinx_new_tab_link", "sphinx_click.ext", + "sphinx_autodoc_typehints", ] templates_path = ["_templates"] exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] +autodoc_typehints = "description" # -- Options for HTML output ------------------------------------------------- diff --git a/docs/contribute.rst b/docs/contribute.rst index 0191658..6bfb7cc 100644 --- a/docs/contribute.rst +++ b/docs/contribute.rst @@ -19,7 +19,7 @@ These are the prerequisites: - Python 3.11+ - Poetry -Install from the source, following `these instructions `_. +Install from the source, following the :ref:`installation` instructions. Brief explanation about the code under the Wayback Tweets directory: diff --git a/docs/field_options.rst b/docs/field_options.rst new file mode 100644 index 0000000..02f4228 --- /dev/null +++ b/docs/field_options.rst @@ -0,0 +1,43 @@ +.. _field_options: + +Field Options +================ + +The package saves in three formats: CSV, JSON, and HTML. The files have the following fields: + +- ``archived_urlkey``: (`str`) A canonical transformation of the URL you supplied, for example, ``org,eserver,tc)/``. Such keys are useful for indexing. + +- ``archived_timestamp``: (`datetime`) A 14 digit date-time representation in the ``YYYYMMDDhhmmss`` format. + +- ``original_tweet_url``: (`str`) The original tweet URL. + +- ``archived_tweet_url``: (`str`) The original archived URL. + +- ``parsed_tweet_url``: (`str`) The original tweet URL after parsing. `Check the utility functions `_. 
+ +- ``parsed_archived_tweet_url``: (`str`) The original archived URL after parsing. `Check the utility functions `_. + +.. TODO: JSON Issue +.. - ``parsed_tweet_text_mimetype_json``: (`str`) The tweet text extracted from the archived URL that has mimetype ``application/json``. + +- ``available_tweet_text``: (`str`) The tweet text extracted from the URL that is still available on the Twitter account. + +- ``available_tweet_is_RT``: (`bool`) Whether the tweet from the ``available_tweet_text`` field is a retweet or not. + +- ``available_tweet_info``: (`str`) Name and date of the tweet from the ``available_tweet_text`` field. + +- ``archived_mimetype``: (`str`) The mimetype of the archived content, which can be one of these: + + - ``text/html`` + + - ``warc/revisit`` + + - ``application/json`` + + - ``unk`` + +- ``archived_statuscode``: (`str`) The HTTP status code of the snapshot. If the mimetype is ``warc/revisit``, the value returned for the ``statuscode`` key can be blank, but the actual value is the same as that of any other entry that has the same ``digest`` as this entry. If the mimetype is ``application/json``, the value is usually empty or ``-``. + +- ``archived_digest``: (`str`) The ``SHA1`` hash digest of the content, excluding the headers. It's usually a base-32-encoded string. + +- ``archived_length``: (`int`) The compressed byte size of the corresponding WARC record, which includes WARC headers, HTTP headers, and content payload. diff --git a/docs/index.rst b/docs/index.rst index 9bf5bcd..ec945d2 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -8,6 +8,9 @@ Wayback Tweets Retrieves archived tweets CDX data from the Wayback Machine, performs necessary parsing, and saves the data in CSV, JSON, and HTML formats. +.. note:: + Intensive queries can lead to rate limiting, resulting in a temporary ban of a few minutes from web.archive.org. 
+ User Guide ------------ @@ -18,7 +21,7 @@ User Guide installation quickstart workflow - result + field_options exceptions contribute todo diff --git a/docs/installation.rst b/docs/installation.rst index e2df76b..5ffb064 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -1,3 +1,5 @@ +.. _installation: + Installation ================ diff --git a/docs/result.rst b/docs/result.rst deleted file mode 100644 index 38b5498..0000000 --- a/docs/result.rst +++ /dev/null @@ -1,41 +0,0 @@ -Result -================ - -The package saves in three formats: CSV, JSON, and HTML. The files have the following fields: - -- ``archived_urlkey``: (`str`) A canonical transformation of the URL you supplied, for example, ``org,eserver,tc)/``. Such keys are useful for indexing. - -- ``archived_timestamp``: (`datetime`) A 14 digit date-time representation in the ``YYYYMMDDhhmmss`` format. - -- ``original_tweet_url``: (`str`) The original tweet URL. - -- ``archived_tweet_url``: (`str`) The original archived URL. - -- ``parsed_tweet_url``: (`str`) The original tweet URL after parsing. `Check the utility functions `_. - -- ``parsed_archived_tweet_url``: (`str`) The original archived URL after parsing. `Check the utility functions `_. - -.. TODO: JSON Issue -.. - ``parsed_tweet_text_mimetype_json``: (`str`) The tweet text extracted from the archived URL that has mimetype ``application/json``. - -- ``available_tweet_text``: (`str`) The tweet text extracted from the URL that is still available on the Twitter account. - -- ``available_tweet_is_RT``: (`bool`) Whether the tweet from the ``available_tweet_text`` field is a retweet or not. - -- ``available_tweet_info``: (`str`) Name and date of the tweet from the ``available_tweet_text`` field. 
- -- ``archived_mimetype``: (`str`) The mimetype of the archived content, which can be one of these: - - - ``text/html`` - - - ``warc/revisit`` - - - ``application/json`` - - - ``unk`` - -- ``archived_statuscode``: (`str`) The HTTP status code of the snapshot. If the mimetype is ``warc/revisit``, the value returned for the ``statuscode`` key can be blank, but the actual value is the same as that of any other entry that has the same ``digest`` as this entry. If the mimetype is ``application/json``, the value is usually empty or ``-``. - -- ``archived_digest``: (`str`) The ``SHA1`` hash digest of the content, excluding the headers. It's usually a base-32-encoded string. - -- ``archived_length``: (`int`) The compressed byte size of the corresponding WARC record, which includes WARC headers, HTTP headers, and content payload. diff --git a/docs/streamlit.rst b/docs/streamlit.rst index b8de7d9..bdf360c 100644 --- a/docs/streamlit.rst +++ b/docs/streamlit.rst @@ -1,3 +1,6 @@ +.. note:: + The current version of the Web App is v0.4.3. Version 1.0 has not yet been implemented in the Streamlit Web App, as it is in the review and testing phase. 
+ Web App ========= @@ -8,9 +11,9 @@ Application that displays multiple archived tweets on Wayback Machine to avoid op Filters ---------- -- Filtering by date range: Using the `from` and `to` filters +- Filtering by date range: Using the ``from`` and ``to`` filters -- Only unavailable tweets: Checks if the archived URL still exists on Twitter (see the `flowchart `_) +- Only unavailable tweets: Checks if the archived URL still exists on Twitter (see the :ref:`flowchart`) - Only unique URLs: Filtering by the collapse option using the ``urlkey`` field and the URL Match Scope ``prefix`` diff --git a/docs/todo.rst b/docs/todo.rst index eaced03..b684cae 100644 --- a/docs/todo.rst +++ b/docs/todo.rst @@ -5,13 +5,15 @@ TODO -|uncheck| Code: JSON Parser: Create a separate function to handle JSON return, apply JsonParser (``waybacktweets/api/parse.py:111``), and avoid rate limiting (`Planned for v1.1`) +|uncheck| JSON Parser: Create a separate function to handle JSON return, apply JsonParser (``waybacktweets/api/parse.py:111``), and avoid rate limiting (`Planned for v1.2`) -|uncheck| Code: Download images when tweet URL has extensions like JPG or PNG (`Planned for v1.2`) +|uncheck| Download images when tweet URL has extensions like JPG or PNG (`Planned for v1.2`) -|uncheck| Code: Develop a scraper to download snapshots from https://archive.today (`Not planned`) +|uncheck| Develop a scraper to download snapshots from https://archive.today (`Not planned`) -|uncheck| Code: Unit Tests (`Planned`) +|uncheck| Unit Tests (`Planned for v1.1`) -|uncheck| Code: Mapping and parsing of other Twitter-related URLs (`Planned`) +|uncheck| Mapping and parsing of other Twitter-related URLs (`Planned`) + +|uncheck| Review and publish the new version of the Streamlit Web App (`Planned for v1.0.1`) diff --git a/docs/workflow.rst b/docs/workflow.rst index 2480b35..046443c 100644 --- a/docs/workflow.rst +++ b/docs/workflow.rst @@ -1,3 +1,5 @@ +..
_flowchart: + Workflow ================ diff --git a/poetry.lock b/poetry.lock index 8346990..b92291e 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1358,6 +1358,25 @@ docs = ["sphinxcontrib-websupport"] lint = ["flake8 (>=3.5.0)", "importlib_metadata", "mypy (==1.9.0)", "pytest (>=6.0)", "ruff (==0.3.7)", "sphinx-lint", "tomli", "types-docutils", "types-requests"] test = ["cython (>=3.0)", "defusedxml (>=0.7.1)", "pytest (>=6.0)", "setuptools (>=67.0)"] +[[package]] +name = "sphinx-autodoc-typehints" +version = "2.1.1" +description = "Type hints (PEP 484) support for the Sphinx autodoc extension" +optional = false +python-versions = ">=3.9" +files = [ + {file = "sphinx_autodoc_typehints-2.1.1-py3-none-any.whl", hash = "sha256:22427d74786274add2b6d4afccb8b3c8c1843f48a704550f15a35fd948f8a4de"}, + {file = "sphinx_autodoc_typehints-2.1.1.tar.gz", hash = "sha256:0072b65f5ab2818c229d6d6c2cc993770af55d36bb7bfb16001e2fce4d14880c"}, +] + +[package.dependencies] +sphinx = ">=7.3.5" + +[package.extras] +docs = ["furo (>=2024.1.29)"] +numpy = ["nptyping (>=2.5)"] +testing = ["covdefaults (>=2.3)", "coverage (>=7.4.4)", "defusedxml (>=0.7.1)", "diff-cover (>=9)", "pytest (>=8.1.1)", "pytest-cov (>=5)", "sphobjinv (>=2.3.1)", "typing-extensions (>=4.11)"] + [[package]] name = "sphinx-click" version = "6.0.0" @@ -1740,4 +1759,4 @@ test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", [metadata] lock-version = "2.0" python-versions = ">=3.9,<3.9.7 || >3.9.7,<4.0" -content-hash = "e2870692e02e31ac100b8f245b07118ea693b67898444ea22ab43963b8feb944" +content-hash = "ae61f09c64379a426d38a928c465c33775dfc8cf2da26bf33709e81cfae80aa5" diff --git a/pyproject.toml b/pyproject.toml index abd71e5..b4db336 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,6 +19,7 @@ pallets-sphinx-themes = "^2.1.3" sphinxcontrib-mermaid = "^0.9.2" sphinx-new-tab-link = "^0.4.0" sphinx-click = "^6.0.0" +sphinx-autodoc-typehints = "^2.1.1" [tool.poetry.group.dev.dependencies] 
black = "^24.4.2" diff --git a/waybacktweets/_cli.py b/waybacktweets/_cli.py index 6039477..d189c20 100644 --- a/waybacktweets/_cli.py +++ b/waybacktweets/_cli.py @@ -14,19 +14,20 @@ from waybacktweets.api.request import WaybackTweets from waybacktweets.config.config import config -def parse_date( +def _parse_date( ctx: Optional[Any] = None, param: Optional[Any] = None, value: Optional[str] = None ) -> Optional[str]: """ Parses a date string and returns it in the format "YYYYMMDD". - :param ctx: Necessary when used with the click package. Defaults to None. - :param param: Necessary when used with the click package. Defaults to None. - :param value: A date string in the "YYYYMMDD" format. Defaults to None. + Args: + ctx: Necessary when used with the click package. Defaults to None. + param: Necessary when used with the click package. Defaults to None. + value: A date string in the "YYYYMMDD" format. Defaults to None. - :returns: The input date string formatted in the "YYYYMMDD" format, - or None if no date string was provided. - """ + Returns: + The input date string formatted in the "YYYYMMDD" format, or None if no date string was provided. + """ # noqa: E501 try: if value is None: return None @@ -53,7 +54,7 @@ def parse_date( "timestamp_from", type=click.UNPROCESSED, metavar="DATE", - callback=parse_date, + callback=_parse_date, default=None, help="Filtering by date range from this date. Format: YYYYmmdd", ) @@ -63,7 +64,7 @@ def parse_date( "timestamp_to", type=click.UNPROCESSED, metavar="DATE", - callback=parse_date, + callback=_parse_date, default=None, help="Filtering by date range up to this date. Format: YYYYmmdd", ) @@ -109,11 +110,10 @@ def main( verbose: Optional[bool], ) -> None: """ - Retrieves archived tweets CDX data from the Wayback Machine, - performs necessary parsing, and saves the data. + Retrieves archived tweets CDX data from the Wayback Machine, performs necessary parsing, and saves the data. USERNAME: The Twitter username without @. 
- """ + """ # noqa: E501 try: config.verbose = verbose diff --git a/waybacktweets/api/export.py b/waybacktweets/api/export.py index 7751679..a599f68 100644 --- a/waybacktweets/api/export.py +++ b/waybacktweets/api/export.py @@ -16,9 +16,10 @@ class TweetsExporter: """ Class responsible for exporting parsed archived tweets. - :param data: The parsed archived tweets data. - :param username: The username associated with the tweets. - :param field_options: The fields to be included in the exported data. Options include "archived_urlkey", "archived_timestamp", "original_tweet_url", "archived_tweet_url", "parsed_tweet_url", "parsed_archived_tweet_url", "available_tweet_text", "available_tweet_is_RT", "available_tweet_info", "archived_mimetype", "archived_statuscode", "archived_digest", "archived_length". + Args: + data (Dict[str, List[Any]]): The parsed archived tweets data. + username (str): The username associated with the tweets. + field_options (List[str]): The fields to be included in the exported data. For more details on each option, visit :ref:`field_options`. """ # noqa: E501 def __init__( @@ -36,7 +37,8 @@ class TweetsExporter: """ Returns the current datetime, formatted as a string. - :returns: The current datetime. + Returns: + The current datetime. """ now = datetime.datetime.now() formatted_now = now.strftime("%Y%m%d%H%M%S") @@ -49,14 +51,15 @@ class TweetsExporter: data: Dict[str, List[Any]], fill_value: Optional[Any] = None ) -> List[List[Any]]: """ - Transposes a matrix, - filling in missing values with a specified fill value if needed. + Transposes a matrix, filling in missing values with a specified fill value if needed. - :param data: The matrix to be transposed. - :param fill_value: The value to fill in missing values with. + Args: + data (Dict[str, List[Any]]): The matrix to be transposed. + fill_value (Optional[Any]): The value to fill in missing values with. - :returns: The transposed matrix. - """ + Returns: + The transposed matrix. 
+ """ # noqa: E501 max_length = max(len(sublist) for sublist in data.values()) filled_data = { @@ -72,7 +75,8 @@ class TweetsExporter: """ Creates a DataFrame from the transposed data. - :returns: The DataFrame representation of the data. + Returns: + The DataFrame representation of the data. """ data_transposed = self._transpose_matrix(self.data) diff --git a/waybacktweets/api/parse.py b/waybacktweets/api/parse.py index 519696c..b6f604a 100644 --- a/waybacktweets/api/parse.py +++ b/waybacktweets/api/parse.py @@ -30,9 +30,10 @@ from waybacktweets.utils.utils import ( class TwitterEmbed: """ - Class responsible for parsing tweets using the Twitter Publish service. + This class is responsible for parsing tweets using the Twitter Publish service. - :param tweet_url: The URL of the tweet to be parsed. + Args: + tweet_url (str): The URL of the tweet to be parsed. """ def __init__(self, tweet_url: str): @@ -42,20 +43,15 @@ class TwitterEmbed: """ Parses the archived tweets when they are still available. - This function goes through each archived tweet and checks - if it is still available. - If the tweet is available, it extracts the necessary information - and adds it to the respective lists. - The function returns a tuple of three lists: + This function goes through each archived tweet and checks if it is still available. If the tweet is available, it extracts the necessary information and adds it to the respective lists. The function returns a tuple of three lists: + - The first list contains the tweet texts. - - The second list contains boolean values indicating whether each tweet - is still available. + - The second list contains boolean values indicating whether each tweet is still available. - The third list contains the URLs of the tweets. - :returns: A tuple of three lists containing the tweet texts, - availability statuses, and URLs, respectively. If no tweets are available, - returns None. 
- """ + Returns: + A tuple of three lists containing the tweet texts, availability statuses, and URLs, respectively. If no tweets are available, returns None. + """ # noqa: E501 try: url = f"https://publish.twitter.com/oembed?url={self.tweet_url}" response = get_response(url=url) @@ -110,12 +106,13 @@ class TwitterEmbed: class JsonParser: """ - Class responsible for parsing tweets when the mimetype is application/json.\n - Note: This class is in an experimental phase, but it is currently being - used by the Streamlit Web App. + This class is responsible for parsing tweets when the mimetype is application/json. - :param archived_tweet_url: The URL of the archived tweet to be parsed. - """ + Note: This class is in an experimental phase, but it is currently being used by the Streamlit Web App. + + Args: + archived_tweet_url (str): The URL of the archived tweet to be parsed. + """ # noqa: E501 def __init__(self, archived_tweet_url: str): self.archived_tweet_url = archived_tweet_url @@ -124,7 +121,8 @@ class JsonParser: """ Parses the archived tweets in JSON format. - :returns: The parsed tweet text. + Returns: + The parsed tweet text. """ try: response = get_response(url=self.archived_tweet_url) @@ -155,11 +153,12 @@ class JsonParser: class TweetsParser: """ - Class responsible for the overall parsing of archived tweets. + This class is responsible for the overall parsing of archived tweets. - :param archived_tweets_response: The response from the archived tweets. - :param username: The username associated with the tweets. - :param field_options: The fields to be included in the parsed data. Options include "archived_urlkey", "archived_timestamp", "original_tweet_url", "archived_tweet_url", "parsed_tweet_url", "parsed_archived_tweet_url", "available_tweet_text", "available_tweet_is_RT", "available_tweet_info", "archived_mimetype", "archived_statuscode", "archived_digest", "archived_length". 
+ Args: + archived_tweets_response (List[str]): The response from the archived tweets. + username (str): The username associated with the tweets. + field_options (List[str]): The fields to be included in the parsed data. For more details on each option, visit :ref:`field_options`. """ # noqa: E501 def __init__( @@ -177,8 +176,9 @@ class TweetsParser: """ Appends a value to a list in the parsed data structure. - :param key: The key in the parsed data structure. - :param value: The value to be appended. + Args: + key (str): The key in the parsed data structure. + value (Any): The value to be appended. """ if key in self.parsed_tweets: self.parsed_tweets[key].append(value) @@ -187,7 +187,8 @@ class TweetsParser: """ Processes the archived tweet's response and adds the relevant CDX data. - :param response: The response from the archived tweet. + Args: + response (List[str]): The response from the archived tweet. """ tweet_remove_char = unquote(response[2]).replace("’", "") cleaned_tweet = check_pattern_tweet(tweet_remove_char).strip('"') @@ -250,10 +251,12 @@ class TweetsParser: """ Parses the archived tweets CDX data and structures it. - :param print_progress: A boolean indicating whether to print progress or not. + Args: + print_progress (bool): A boolean indicating whether to print progress or not. - :returns: The parsed tweets data. - """ + Returns: + The parsed tweets data. + """ # noqa: E501 with ThreadPoolExecutor(max_workers=10) as executor: futures = { diff --git a/waybacktweets/api/request.py b/waybacktweets/api/request.py index d7d37a1..edf68e4 100644 --- a/waybacktweets/api/request.py +++ b/waybacktweets/api/request.py @@ -21,13 +21,14 @@ class WaybackTweets: """ Class responsible for requesting data from the Wayback CDX Server API. - :param username: The username associated with the tweets. - :param collapse: The field to collapse duplicate lines on. - :param timestamp_from: The timestamp to start retrieving tweets from. 
- :param timestamp_to: The timestamp to stop retrieving tweets at. - :param limit: The maximum number of results to return. - :param offset: The number of lines to skip in the results. - :param matchType: Results matching a certain prefix, a certain host or all subdomains. + Args: + username (str): The username associated with the tweets. + collapse (str, optional): The field to collapse duplicate lines on. + timestamp_from (str, optional): The timestamp to start retrieving tweets from. + timestamp_to (str, optional): The timestamp to stop retrieving tweets at. + limit (int, optional): The maximum number of results to return. + offset (int, optional): The number of lines to skip in the results. + matchtype (str, optional): Results matching a certain prefix, a certain host or all subdomains. """ # noqa: E501 def __init__( @@ -50,11 +51,11 @@ class WaybackTweets: def get(self) -> Optional[Dict[str, Any]]: """ - Sends a GET request to the Internet Archive's CDX API - to retrieve archived tweets. + Sends a GET request to the Internet Archive's CDX API to retrieve archived tweets. - :returns: The response from the CDX API in JSON format, if successful. - """ + Returns: + The response from the CDX API in JSON format, if successful. Otherwise, None. + """ # noqa: E501 url = "https://web.archive.org/cdx/search/cdx" status_pathname = "status/*" diff --git a/waybacktweets/api/visualize.py b/waybacktweets/api/visualize.py index ef13aac..70824e7 100644 --- a/waybacktweets/api/visualize.py +++ b/waybacktweets/api/visualize.py @@ -11,13 +11,14 @@ class HTMLTweetsVisualizer: """ Class responsible for generating an HTML file to visualize the parsed data. - :param json_content: The content of the JSON file. - :param html_file_path: The path where the HTML file will be saved. - :param username: The username associated with the tweets. + Args: + json_file_path (str): The path of the JSON file. + html_file_path (str): The path where the HTML file will be saved. 
+ username (str): The username associated with the tweets. """ def __init__(self, json_file_path: str, html_file_path: str, username: str): - self.json_content = self._json_loader(json_file_path) + self.json_file_path = self._json_loader(json_file_path) self.html_file_path = html_file_path self.username = username @@ -26,9 +27,11 @@ class HTMLTweetsVisualizer: """ Reads and loads JSON data from a specified file path. - :param json_file_path: The path of the JSON file. + Args: + json_file_path (str): The path of the JSON file. - :returns: The content of the JSON file. + Returns: + The content of the JSON file. """ with open(json_file_path, "r", encoding="utf-8") as f: return json.load(f) @@ -37,7 +40,8 @@ class HTMLTweetsVisualizer: """ Generates an HTML string that represents the parsed data. - :returns: The generated HTML string. + Returns: + The generated HTML string. """ html = f"\n\n@{self.username} archived tweets\n" @@ -56,23 +60,9 @@ class HTMLTweetsVisualizer: html += f"

@{self.username} archived tweets

\n" html += '
\n' - for tweet in self.json_content: + for tweet in self.json_file_path: html += '
\n' - # TODO: JSON Issue - # if ( - # ( - # tweet["archived_mimetype"] != "application/json" - # and not tweet["parsed_tweet_text_mimetype_json"] - # ) - # and not tweet["available_tweet_text"] - # ) or ( - # ( - # tweet["archived_mimetype"] == "application/json" - # and not tweet["parsed_tweet_text_mimetype_json"] - # ) - # and not tweet["available_tweet_text"] - # ): if ( tweet["archived_mimetype"] != "application/json" and not tweet["available_tweet_text"] @@ -90,13 +80,6 @@ class HTMLTweetsVisualizer: html += f'

Available Tweet Is Retweet: {tweet["available_tweet_is_RT"]}

\n' html += f'

Available Tweet Username: {tweet["available_tweet_info"]}

\n' - # TODO: JSON Issue - # if ( - # tweet["archived_mimetype"] == "application/json" - # and tweet["parsed_tweet_text_mimetype_json"] - # ) and not tweet["available_tweet_text"]: - # html += f'

Parsed Tweet Text (application/json): {tweet["parsed_tweet_text_mimetype_json"]}

\n' - html += "
\n" html += f'

Archived URL Key: {tweet["archived_urlkey"]}

\n' html += f'

Archived Timestamp: {tweet["archived_timestamp"]}

\n' @@ -120,7 +103,8 @@ class HTMLTweetsVisualizer: """ Saves the generated HTML string to a file. - :param html_content: The HTML string to be saved. + Args: + html_content (str): The HTML string to be saved. """ with open(self.html_file_path, "w", encoding="utf-8") as f: f.write(html_content) diff --git a/waybacktweets/config/config.py b/waybacktweets/config/config.py index 5d1ab67..ae986be 100644 --- a/waybacktweets/config/config.py +++ b/waybacktweets/config/config.py @@ -1,18 +1,28 @@ """ +Configuration module. + Manages global configuration settings throughout the application. """ +from dataclasses import dataclass + +@dataclass class _Config: - def __init__(self, verbose: bool = True): - self.verbose = verbose + """ + A class used to represent the configuration settings. + + Attributes: + verbose (bool): Determines if verbose logging should be enabled. + """ + + verbose: bool = True config = _Config() """ -Configuration settings. - -.. attribute:: verbose +Global configuration instance. - Determines if verbose logging should be enabled. +Attributes: + verbose (bool): Determines if verbose logging should be enabled. """ diff --git a/waybacktweets/exceptions/exceptions.py b/waybacktweets/exceptions/exceptions.py index 383fb80..ec98113 100644 --- a/waybacktweets/exceptions/exceptions.py +++ b/waybacktweets/exceptions/exceptions.py @@ -4,20 +4,30 @@ Wayback Tweets Exceptions class GetResponseError(Exception): - """Base class for exceptions in get_response.""" + """ + Base class for exceptions in get_response. + """ class ReadTimeoutError(GetResponseError): - """Exception raised for read timeout errors.""" + """ + Exception raised for read timeout errors. + """ class ConnectionError(GetResponseError): - """Exception raised for connection errors.""" + """ + Exception raised for connection errors. + """ class HTTPError(GetResponseError): - """Exception raised for HTTP errors.""" + """ + Exception raised for HTTP errors. 
+ """ class EmptyResponseError(GetResponseError): - """Exception raised for empty responses.""" + """ + Exception raised for empty responses. + """ diff --git a/waybacktweets/utils/utils.py b/waybacktweets/utils/utils.py index 3261434..ada66b8 100644 --- a/waybacktweets/utils/utils.py +++ b/waybacktweets/utils/utils.py @@ -24,15 +24,18 @@ def get_response( """ Sends a GET request to the specified URL and returns the response. - :param url: The URL to send the GET request to. - :param params: The parameters to include in the GET request. + Args: + url (str): The URL to send the GET request to. + params (dict, optional): The parameters to include in the GET request. - :returns: The response from the server. + Returns: + The response from the server. - :raises ReadTimeoutError: If a read timeout occurs. - :raises ConnectionError: If a connection error occurs. - :raises HTTPError: If an HTTP error occurs. - :raises EmptyResponseError: If the response is empty. + Raises: + ReadTimeoutError: If a read timeout occurs. + ConnectionError: If a connection error occurs. + HTTPError: If an HTTP error occurs. + EmptyResponseError: If the response is empty. """ session = requests.Session() retry = Retry(connect=3, backoff_factor=0.3) @@ -65,10 +68,12 @@ def clean_tweet_url(tweet_url: str, username: str) -> str: """ Cleans a tweet URL by ensuring it is associated with the correct username. - :param tweet_url: The tweet URL to clean. - :param username: The username to associate with the tweet URL. + Args: + tweet_url (str): The tweet URL to clean. + username (str): The username to associate with the tweet URL. - :returns: The cleaned tweet URL. + Returns: + The cleaned tweet URL. """ tweet_lower = tweet_url.lower() @@ -86,15 +91,16 @@ def clean_wayback_machine_url( wayback_machine_url: str, archived_timestamp: str, username: str ) -> str: """ - Cleans a Wayback Machine URL by ensuring it is associated with the correct username - and timestamp. 
+ Cleans a Wayback Machine URL by ensuring it is associated with the correct username and timestamp. - :param wayback_machine_url: The Wayback Machine URL to clean. - :param archived_timestamp: The timestamp to associate with the Wayback Machine URL. - :param username: The username to associate with the Wayback Machine URL. + Args: + wayback_machine_url (str): The Wayback Machine URL to clean. + archived_timestamp (str): The timestamp to associate with the Wayback Machine URL. + username (str): The username to associate with the Wayback Machine URL. - :returns: The cleaned Wayback Machine URL. - """ + Returns: + The cleaned Wayback Machine URL. + """ # noqa: E501 wayback_machine_url = wayback_machine_url.lower() pattern = re.compile(r"/status/(\d+)") @@ -110,9 +116,11 @@ def check_pattern_tweet(tweet_url: str) -> str: """ Extracts the tweet ID from a tweet URL. - :param tweet_url: The tweet URL to extract the ID from. + Args: + tweet_url (str): The tweet URL to extract the ID from. - :returns: The extracted tweet ID. + Returns: + The extracted tweet ID. """ pattern = re.compile(r'/status/"([^"]+)"') @@ -127,9 +135,11 @@ def delete_tweet_pathnames(tweet_url: str) -> str: """ Removes any pathnames from a tweet URL. - :param tweet_url: The tweet URL to remove pathnames from. + Args: + tweet_url (str): The tweet URL to remove pathnames from. - :returns: The tweet URL without any pathnames. + Returns: + The tweet URL without any pathnames. """ pattern_username = re.compile(r"https://twitter\.com/([^/]+)/status/\d+") match_username = pattern_username.match(tweet_url) @@ -147,14 +157,15 @@ def delete_tweet_pathnames(tweet_url: str) -> str: def check_double_status(wayback_machine_url: str, original_tweet_url: str) -> bool: """ - Checks if a Wayback Machine URL contains two occurrences of "/status/" - and if the original tweet does not contain "twitter.com". 
+ Checks if a Wayback Machine URL contains two occurrences of "/status/" and if the original tweet does not contain "twitter.com". - :param wayback_machine_url: The Wayback Machine URL to check. - :param original_tweet_url: The original tweet URL to check. + Args: + wayback_machine_url (str): The Wayback Machine URL to check. + original_tweet_url (str): The original tweet URL to check. - :returns: True if the conditions are met, False otherwise. - """ + Returns: + True if the conditions are met, False otherwise. + """ # noqa: E501 if ( wayback_machine_url.count("/status/") == 2 and "twitter.com" not in original_tweet_url @@ -168,9 +179,11 @@ def semicolon_parser(string: str) -> str: """ Replaces semicolons in a string with %3B. - :param string: The string to replace semicolons in. + Args: + string (str): The string to replace semicolons in. - :returns: The string with semicolons replaced by %3B. + Returns: + The string with semicolons replaced by %3B. """ return "".join("%3B" if c == ";" else c for c in string) @@ -179,13 +192,14 @@ def is_tweet_url(twitter_url: str) -> bool: """ Checks if the provided URL is a Twitter status URL. - This function checks if the provided URL contains "/status/" exactly once, - which is a common pattern in Twitter status URLs. + This function checks if the provided URL contains "/status/" exactly once, which is a common pattern in Twitter status URLs. - :param twitter_url: The URL to check. + Args: + twitter_url (str): The URL to check. - :returns: True if the URL is a Twitter status URL, False otherwise. - """ + Returns: + True if the URL is a Twitter status URL, False otherwise. + """ # noqa: E501 if twitter_url.count("/status/") == 1: return True