import datetime
import os
import re
+from typing import Any, Dict, List, Optional
import pandas as pd
class TweetsExporter:
- """Handles the exporting of parsed archived tweets."""
+ """
+ Class responsible for exporting parsed archived tweets.
- def __init__(self, data, username, field_options):
+ :param data: The parsed archived tweets data.
+ :param username: The username associated with the tweets.
+ :param field_options: The fields to be included in the exported data.
+ """
+
+ def __init__(
+ self, data: Dict[str, List[Any]], username: str, field_options: List[str]
+ ):
self.data = data
self.username = username
self.field_options = field_options
self.dataframe = self._create_dataframe()
@staticmethod
- def _datetime_now():
- """Formats datetime."""
+ def _datetime_now() -> str:
+ """
+ Returns the current datetime, formatted as a string.
+
+ :returns: The current datetime.
+ """
now = datetime.datetime.now()
formatted_now = now.strftime("%Y%m%d%H%M%S")
formatted_now = re.sub(r"\W+", "", formatted_now)
return formatted_now
@staticmethod
- def _transpose_matrix(data, fill_value=None):
+ def _transpose_matrix(
+ data: Dict[str, List[Any]], fill_value: Optional[Any] = None
+ ) -> List[List[Any]]:
"""
- Transposes a matrix, filling in missing values with a specified fill value
- if needed.
+ Transposes a matrix,
+ filling in missing values with a specified fill value if needed.
+
+ :param data: The matrix to be transposed.
+ :param fill_value: The value to fill in missing values with.
+
+ :returns: The transposed matrix.
"""
max_length = max(len(sublist) for sublist in data.values())
return data_transposed
- def _create_dataframe(self):
- """Creates a DataFrame from the transposed data."""
+ def _create_dataframe(self) -> pd.DataFrame:
+ """
+ Creates a DataFrame from the transposed data.
+
+ :returns: The DataFrame representation of the data.
+ """
data_transposed = self._transpose_matrix(self.data)
df = pd.DataFrame(data_transposed, columns=self.field_options)
return df
- def save_to_csv(self):
- """Saves the DataFrame to a CSV file."""
+ def save_to_csv(self) -> None:
+ """
+ Saves the DataFrame to a CSV file.
+ """
csv_file_path = f"{self.filename}.csv"
self.dataframe.to_csv(csv_file_path, index=False)
print(f"Saved to {csv_file_path}")
- def save_to_json(self):
- """Saves the DataFrame to a JSON file."""
+ def save_to_json(self) -> None:
+ """
+ Saves the DataFrame to a JSON file.
+ """
json_file_path = f"{self.filename}.json"
self.dataframe.to_json(json_file_path, orient="records", lines=False)
print(f"Saved to {json_file_path}")
- def save_to_html(self):
- """Saves the DataFrame to an HTML file."""
+ def save_to_html(self) -> None:
+ """
+ Saves the DataFrame to an HTML file.
+ """
json_file_path = f"{self.filename}.json"
if not os.path.exists(json_file_path):
import re
from concurrent.futures import ThreadPoolExecutor, as_completed
+from typing import Any, Dict, List, Optional, Tuple
from urllib.parse import unquote
from requests import exceptions
class TwitterEmbed:
- """Handles parsing of tweets using the Twitter Publish service."""
+ """
+ Class responsible for parsing tweets using the Twitter Publish service.
- def __init__(self, tweet_url):
+ :param tweet_url: The URL of the tweet to be parsed.
+ """
+
+ def __init__(self, tweet_url: str):
self.tweet_url = tweet_url
- def embed(self):
- """Parses the archived tweets when they are still available."""
+ def embed(self) -> Optional[Tuple[List[str], List[bool], List[str]]]:
+ """
+ Parses the archived tweets when they are still available.
+
+ This function goes through each archived tweet and checks
+ if it is still available.
+ If the tweet is available, it extracts the necessary information
+ and adds it to the respective lists.
+ The function returns a tuple of three lists:
+ - The first list contains the tweet texts.
+ - The second list contains boolean values indicating whether each tweet
+ is still available.
+ - The third list contains the URLs of the tweets.
+
+ :returns: A tuple of three lists containing the tweet texts,
+ availability statuses, and URLs, respectively. If no tweets are available,
+ returns None.
+ """
try:
url = f"https://publish.twitter.com/oembed?url={self.tweet_url}"
response = get_response(url=url)
# TODO: JSON Issue - Create separate function to handle JSON return without hitting rate limiting # noqa: E501
class JsonParser:
- """Handles parsing of tweets when the mimetype is application/json."""
+ """
+ Class responsible for parsing tweets when the mimetype is application/json.
+
+ :param archived_tweet_url: The URL of the archived tweet to be parsed.
+ """
- def __init__(self, archived_tweet_url):
+ def __init__(self, archived_tweet_url: str):
self.archived_tweet_url = archived_tweet_url
- def parse(self):
- """Parses the archived tweets in JSON format."""
+ def parse(self) -> str:
+ """
+ Parses the archived tweets in JSON format.
+
+ :returns: The parsed tweet text.
+ """
try:
response = get_response(url=self.archived_tweet_url)
class TweetsParser:
- """Handles the overall parsing of archived tweets."""
-
- def __init__(self, archived_tweets_response, username, field_options):
+ """
+ Class responsible for the overall parsing of archived tweets.
+
+ :param archived_tweets_response: The response from the archived tweets.
+ :param username: The username associated with the tweets.
+ :param field_options: The fields to be included in the parsed data.
+ """
+
+ def __init__(
+ self,
+ archived_tweets_response: List[str],
+ username: str,
+ field_options: List[str],
+ ):
self.archived_tweets_response = archived_tweets_response
self.username = username
self.field_options = field_options
self.parsed_tweets = {option: [] for option in self.field_options}
- def _add_field(self, key, value):
+ def _add_field(self, key: str, value: Any) -> None:
"""
Appends a value to a list in the parsed data structure.
- Defines which data will be structured and saved.
+
+ :param key: The key in the parsed data structure.
+ :param value: The value to be appended.
"""
if key in self.parsed_tweets:
self.parsed_tweets[key].append(value)
- def _process_response(self, response):
- """Process the archived tweet's response and add the relevant CDX data."""
+ def _process_response(self, response: List[str]) -> None:
+ """
+ Processes the archived tweet's response and adds the relevant CDX data.
+
+ :param response: The response from the archived tweet.
+ """
tweet_remove_char = unquote(response[2]).replace("’", "")
cleaned_tweet = check_pattern_tweet(tweet_remove_char).strip('"')
self._add_field("archived_digest", response[5])
self._add_field("archived_length", response[6])
- def parse(self):
- """Parses the archived tweets CDX data and structures it."""
+ def parse(self) -> Dict[str, List[Any]]:
+ """
+ Parses the archived tweets CDX data and structures it.
+
+ :returns: The parsed tweets data.
+ """
with ThreadPoolExecutor(max_workers=10) as executor:
futures = {
+from typing import Any, Dict, Optional
+
from requests import exceptions
from rich import print as rprint
class WaybackTweets:
- """Requests data from the Wayback CDX Server API and returns it in JSON format."""
+ """
+ Class responsible for requesting data from the Wayback CDX Server API.
+
+ :param username: The username associated with the tweets.
+ :param collapse: The field to collapse duplicate lines on.
+ :param timestamp_from: The timestamp to start retrieving tweets from.
+ :param timestamp_to: The timestamp to stop retrieving tweets at.
+ :param limit: The maximum number of results to return.
+ :param offset: The number of lines to skip in the results.
+ """
- def __init__(self, username, collapse, timestamp_from, timestamp_to, limit, offset):
+ def __init__(
+ self,
+ username: str,
+ collapse: str,
+ timestamp_from: str,
+ timestamp_to: str,
+ limit: int,
+ offset: int,
+ ):
self.username = username
self.collapse = collapse
self.timestamp_from = timestamp_from
self.limit = limit
self.offset = offset
- def get(self):
- """GET request to the Internet Archive's CDX API to retrieve archived tweets."""
+ def get(self) -> Optional[Dict[str, Any]]:
+ """
+ Sends a GET request to the Internet Archive's CDX API
+ to retrieve archived tweets.
+
+ :returns: The response from the CDX API in JSON format, if successful.
+ """
url = "https://web.archive.org/cdx/search/cdx"
params = {
"url": f"https://twitter.com/{self.username}/status/*",
# flake8: noqa: E501
import json
+from typing import Any, Dict, List
class HTMLTweetsVisualizer:
- """Generates an HTML file to visualize the parsed data."""
+ """
+ Class responsible for generating an HTML file to visualize the parsed data.
- def __init__(self, json_file_path, html_file_path, username):
+ :param json_file_path: The path of the JSON file to load the content from.
+ :param html_file_path: The path where the HTML file will be saved.
+ :param username: The username associated with the tweets.
+ """
+
+ def __init__(self, json_file_path: str, html_file_path: str, username: str):
self.json_content = self._json_loader(json_file_path)
self.html_file_path = html_file_path
self.username = username
@staticmethod
- def _json_loader(json_file_path):
- """Reads and loads JSON data from a specified file path."""
+ def _json_loader(json_file_path: str) -> List[Dict[str, Any]]:
+ """
+ Reads and loads JSON data from a specified file path.
+
+ :param json_file_path: The path of the JSON file.
+
+ :returns: The content of the JSON file.
+ """
with open(json_file_path, "r", encoding="utf-8") as f:
return json.load(f)
- def generate(self):
- """Generates an HTML file."""
+ def generate(self) -> str:
+ """
+ Generates an HTML string that represents the parsed data.
+
+ :returns: The generated HTML string.
+ """
html = f"<html>\n<head>\n<title>@{self.username} archived tweets</title>\n"
html += "<style>\n"
return html
- def save(self, html_content):
- """Saves the generated HTML."""
+ def save(self, html_content: str) -> None:
+ """
+ Saves the generated HTML string to a file.
+
+ :param html_content: The HTML string to be saved.
+ """
with open(self.html_file_path, "w", encoding="utf-8") as f:
f.write(html_content)
"""
-Helper functions.
+Module containing utility functions for handling HTTP requests and manipulating URLs.
"""
import re
from datetime import datetime
+from typing import Any, Optional
import click
import requests
from urllib3.util.retry import Retry
-def get_response(url, params=None):
- """Sends a GET request to the specified URL and returns the response."""
+def get_response(
+ url: str, params: Optional[dict] = None
+) -> Optional[requests.Response]:
+ """
+ Sends a GET request to the specified URL and returns the response.
+
+ :param url: The URL to send the GET request to.
+ :param params: The parameters to include in the GET request.
+
+ :returns: The response from the server,
+ if the status code is not in the 400-511 range.
+ Returns None if the status code is in the 400-511 range.
+ """
session = requests.Session()
retry = Retry(connect=3, backoff_factor=0.3)
adapter = HTTPAdapter(max_retries=retry)
return response
-def clean_tweet_url(tweet_url, username):
+def clean_tweet_url(tweet_url: str, username: str) -> str:
"""
- Converts the tweet to lowercase,
- checks if it contains a tweet URL associated with the username.
- Returns the original tweet URL with correct casing;
- or returns the original tweet.
+ Cleans a tweet URL by ensuring it is associated with the correct username.
+
+ :param tweet_url: The tweet URL to clean.
+ :param username: The username to associate with the tweet URL.
+
+ :returns: The cleaned tweet URL.
"""
tweet_lower = tweet_url.lower()
return tweet_url
-def clean_wayback_machine_url(wayback_machine_url, archived_timestamp, username):
+def clean_wayback_machine_url(
+ wayback_machine_url: str, archived_timestamp: str, username: str
+) -> str:
"""
- Converts the Wayback Machine URL to lowercase,
- checks if it contains a tweet URL associated with the username.
- Returns the original tweet URL with correct casing and archived timestamp;
- otherwise, it returns the original Wayback Machine URL.
+ Cleans a Wayback Machine URL by ensuring it is associated with the correct username
+ and timestamp.
+
+ :param wayback_machine_url: The Wayback Machine URL to clean.
+ :param archived_timestamp: The timestamp to associate with the Wayback Machine URL.
+ :param username: The username to associate with the Wayback Machine URL.
+
+ :returns: The cleaned Wayback Machine URL.
"""
wayback_machine_url = wayback_machine_url.lower()
return wayback_machine_url
-def check_pattern_tweet(tweet_url):
+def check_pattern_tweet(tweet_url: str) -> str:
"""
- Extracts tweet IDs from various types of tweet URLs or tweet-related patterns.
+ Extracts the tweet ID from a tweet URL.
- Reply pattern: /status//
- Link pattern: /status///
- Twimg pattern: /status/https://pbs
+ :param tweet_url: The tweet URL to extract the ID from.
+
+ :returns: The extracted tweet ID.
"""
pattern = re.compile(r'/status/"([^"]+)"')
return tweet_url
-def delete_tweet_pathnames(tweet_url):
- """Removes any pathnames (/photos, /likes, /retweet...) from the tweet URL."""
+def delete_tweet_pathnames(tweet_url: str) -> str:
+ """
+ Removes any pathnames from a tweet URL.
+
+ :param tweet_url: The tweet URL to remove pathnames from.
+
+ :returns: The tweet URL without any pathnames.
+ """
pattern_username = re.compile(r"https://twitter\.com/([^/]+)/status/\d+")
match_username = pattern_username.match(tweet_url)
return tweet_url
-def check_double_status(wayback_machine_url, original_tweet_url):
+def check_double_status(wayback_machine_url: str, original_tweet_url: str) -> bool:
"""
Checks if a Wayback Machine URL contains two occurrences of "/status/"
and if the original tweet does not contain "twitter.com".
- Returns a boolean.
+
+ :param wayback_machine_url: The Wayback Machine URL to check.
+ :param original_tweet_url: The original tweet URL to check.
+
+ :returns: True if the conditions are met, False otherwise.
"""
if (
wayback_machine_url.count("/status/") == 2
return False
-def semicolon_parser(string):
- """Replaces semicolons in a string with %3B."""
+def semicolon_parser(string: str) -> str:
+ """
+ Replaces semicolons in a string with %3B.
+
+ :param string: The string to replace semicolons in.
+
+ :returns: The string with semicolons replaced by %3B.
+ """
return "".join("%3B" if c == ";" else c for c in string)
-def parse_date(ctx=None, param=None, value=None):
+def parse_date(
+ ctx: Optional[Any] = None, param: Optional[Any] = None, value: Optional[str] = None
+) -> Optional[str]:
"""
Parses a date string and returns it in the format "YYYYMMDD".
- This function takes an optional date string as input,
- and if a date string is provided, it parses the date string into a datetime object
- and then formats it in the "YYYYMMDD" format.
-
- Args:
- ctx (None, optional): Necessary when used with the click package.
- Defaults to None.
- param (None, optional): Necessary when used with the click package.
- Defaults to None.
- value (str, optional): A date string in the "YYYYMMDD" format. Defaults to None.
+ :param ctx: Necessary when used with the click package. Defaults to None.
+ :param param: Necessary when used with the click package. Defaults to None.
+ :param value: A date string in the "YYYYMMDD" format. Defaults to None.
- Returns:
- str: The input date string formatted in the "YYYYMMDD" format,
+ :returns: The input date string formatted in the "YYYYMMDD" format,
or None if no date string was provided.
"""
try: