From: Claromes
Date: Fri, 14 Jun 2024 21:02:13 +0000 (-0300)
Subject: review docstrings and add typing
X-Git-Url: https://git.claromes.com/?a=commitdiff_plain;h=e9f38c183dfd6e7b16bcf40fa4987fbb5a4b93ff;p=waybacktweets.git

review docstrings and add typing
---

diff --git a/waybacktweets/api/export_tweets.py b/waybacktweets/api/export_tweets.py
index 6f38351..cb789f6 100644
--- a/waybacktweets/api/export_tweets.py
+++ b/waybacktweets/api/export_tweets.py
@@ -1,6 +1,7 @@
 import datetime
 import os
 import re
+from typing import Any, Dict, List, Optional
 
 import pandas as pd
 
@@ -8,9 +9,17 @@ from waybacktweets.api.viz_tweets import HTMLTweetsVisualizer
 
 
 class TweetsExporter:
-    """Handles the exporting of parsed archived tweets."""
+    """
+    Class responsible for exporting parsed archived tweets.
 
-    def __init__(self, data, username, field_options):
+    :param data: The parsed archived tweets data.
+    :param username: The username associated with the tweets.
+    :param field_options: The fields to be included in the exported data.
+    """
+
+    def __init__(
+        self, data: Dict[str, List[Any]], username: str, field_options: List[str]
+    ):
         self.data = data
         self.username = username
         self.field_options = field_options
@@ -19,8 +28,12 @@ class TweetsExporter:
         self.dataframe = self._create_dataframe()
 
     @staticmethod
-    def _datetime_now():
-        """Formats datetime."""
+    def _datetime_now() -> str:
+        """
+        Returns the current datetime, formatted as a string.
+
+        :returns: The current datetime.
+        """
         now = datetime.datetime.now()
         formatted_now = now.strftime("%Y%m%d%H%M%S")
         formatted_now = re.sub(r"\W+", "", formatted_now)
@@ -28,10 +41,17 @@
         return formatted_now
 
     @staticmethod
-    def _transpose_matrix(data, fill_value=None):
+    def _transpose_matrix(
+        data: Dict[str, List[Any]], fill_value: Optional[Any] = None
+    ) -> List[List[Any]]:
         """
-        Transposes a matrix, filling in missing values with a specified fill value
-        if needed.
+        Transposes a matrix,
+        filling in missing values with a specified fill value if needed.
+
+        :param data: The matrix to be transposed.
+        :param fill_value: The value to fill in missing values with.
+
+        :returns: The transposed matrix.
         """
         max_length = max(len(sublist) for sublist in data.values())
 
@@ -44,30 +64,40 @@
 
         return data_transposed
 
-    def _create_dataframe(self):
-        """Creates a DataFrame from the transposed data."""
+    def _create_dataframe(self) -> pd.DataFrame:
+        """
+        Creates a DataFrame from the transposed data.
+
+        :returns: The DataFrame representation of the data.
+        """
         data_transposed = self._transpose_matrix(self.data)
         df = pd.DataFrame(data_transposed, columns=self.field_options)
 
         return df
 
-    def save_to_csv(self):
-        """Saves the DataFrame to a CSV file."""
+    def save_to_csv(self) -> None:
+        """
+        Saves the DataFrame to a CSV file.
+        """
         csv_file_path = f"{self.filename}.csv"
         self.dataframe.to_csv(csv_file_path, index=False)
 
         print(f"Saved to {csv_file_path}")
 
-    def save_to_json(self):
-        """Saves the DataFrame to a JSON file."""
+    def save_to_json(self) -> None:
+        """
+        Saves the DataFrame to a JSON file.
+        """
         json_file_path = f"{self.filename}.json"
         self.dataframe.to_json(json_file_path, orient="records", lines=False)
 
         print(f"Saved to {json_file_path}")
 
-    def save_to_html(self):
-        """Saves the DataFrame to an HTML file."""
+    def save_to_html(self) -> None:
+        """
+        Saves the DataFrame to an HTML file.
+        """
         json_file_path = f"{self.filename}.json"
 
         if not os.path.exists(json_file_path):
diff --git a/waybacktweets/api/parse_tweets.py b/waybacktweets/api/parse_tweets.py
index 465287f..5c8adcb 100644
--- a/waybacktweets/api/parse_tweets.py
+++ b/waybacktweets/api/parse_tweets.py
@@ -1,5 +1,6 @@
 import re
 from concurrent.futures import ThreadPoolExecutor, as_completed
+from typing import Any, Dict, List, Optional, Tuple
 from urllib.parse import unquote
 
 from requests import exceptions
@@ -17,13 +18,33 @@ from waybacktweets.utils.utils import (
 
 
 class TwitterEmbed:
-    """Handles parsing of tweets using the Twitter Publish service."""
+    """
+    Class responsible for parsing tweets using the Twitter Publish service.
 
-    def __init__(self, tweet_url):
+    :param tweet_url: The URL of the tweet to be parsed.
+    """
+
+    def __init__(self, tweet_url: str):
         self.tweet_url = tweet_url
 
-    def embed(self):
-        """Parses the archived tweets when they are still available."""
+    def embed(self) -> Optional[Tuple[List[str], List[bool], List[str]]]:
+        """
+        Parses the archived tweets when they are still available.
+
+        This function goes through each archived tweet and checks
+        if it is still available.
+        If the tweet is available, it extracts the necessary information
+        and adds it to the respective lists.
+        The function returns a tuple of three lists:
+        - The first list contains the tweet texts.
+        - The second list contains boolean values indicating whether each tweet
+        is still available.
+        - The third list contains the URLs of the tweets.
+
+        :returns: A tuple of three lists containing the tweet texts,
+        availability statuses, and URLs, respectively. If no tweets are available,
+        returns None.
+        """
         try:
             url = f"https://publish.twitter.com/oembed?url={self.tweet_url}"
             response = get_response(url=url)
@@ -72,13 +93,21 @@ class TwitterEmbed:
 
 
 # TODO: JSON Issue - Create separate function to handle JSON return without hitting rate limiting # noqa: E501
 class JsonParser:
-    """Handles parsing of tweets when the mimetype is application/json."""
+    """
+    Class responsible for parsing tweets when the mimetype is application/json.
+
+    :param archived_tweet_url: The URL of the archived tweet to be parsed.
+    """
 
-    def __init__(self, archived_tweet_url):
+    def __init__(self, archived_tweet_url: str):
         self.archived_tweet_url = archived_tweet_url
 
-    def parse(self):
-        """Parses the archived tweets in JSON format."""
+    def parse(self) -> str:
+        """
+        Parses the archived tweets in JSON format.
+
+        :returns: The parsed tweet text.
+        """
         try:
             response = get_response(url=self.archived_tweet_url)
@@ -109,24 +138,41 @@ class JsonParser:
 
 
 class TweetsParser:
-    """Handles the overall parsing of archived tweets."""
-
-    def __init__(self, archived_tweets_response, username, field_options):
+    """
+    Class responsible for the overall parsing of archived tweets.
+
+    :param archived_tweets_response: The response from the archived tweets.
+    :param username: The username associated with the tweets.
+    :param field_options: The fields to be included in the parsed data.
+    """
+
+    def __init__(
+        self,
+        archived_tweets_response: List[str],
+        username: str,
+        field_options: List[str],
+    ):
         self.archived_tweets_response = archived_tweets_response
         self.username = username
         self.field_options = field_options
 
         self.parsed_tweets = {option: [] for option in self.field_options}
 
-    def _add_field(self, key, value):
+    def _add_field(self, key: str, value: Any) -> None:
         """
         Appends a value to a list in the parsed data structure.
-        Defines which data will be structured and saved.
+
+        :param key: The key in the parsed data structure.
+        :param value: The value to be appended.
         """
         if key in self.parsed_tweets:
             self.parsed_tweets[key].append(value)
 
-    def _process_response(self, response):
-        """Process the archived tweet's response and add the relevant CDX data."""
+    def _process_response(self, response: List[str]) -> None:
+        """
+        Processes the archived tweet's response and adds the relevant CDX data.
+
+        :param response: The response from the archived tweet.
+        """
         tweet_remove_char = unquote(response[2]).replace("’", "")
         cleaned_tweet = check_pattern_tweet(tweet_remove_char).strip('"')
@@ -185,8 +231,12 @@ class TweetsParser:
         self._add_field("archived_digest", response[5])
         self._add_field("archived_length", response[6])
 
-    def parse(self):
-        """Parses the archived tweets CDX data and structures it."""
+    def parse(self) -> Dict[str, List[Any]]:
+        """
+        Parses the archived tweets CDX data and structures it.
+
+        :returns: The parsed tweets data.
+        """
 
         with ThreadPoolExecutor(max_workers=10) as executor:
             futures = {
diff --git a/waybacktweets/api/request_tweets.py b/waybacktweets/api/request_tweets.py
index 7bb9dd1..baebe3a 100644
--- a/waybacktweets/api/request_tweets.py
+++ b/waybacktweets/api/request_tweets.py
@@ -1,3 +1,5 @@
+from typing import Any, Dict, Optional
+
 from requests import exceptions
 from rich import print as rprint
 
@@ -5,9 +7,26 @@ from waybacktweets.utils.utils import get_response
 
 
 class WaybackTweets:
-    """Requests data from the Wayback CDX Server API and returns it in JSON format."""
+    """
+    Class responsible for requesting data from the Wayback CDX Server API.
+
+    :param username: The username associated with the tweets.
+    :param collapse: The field to collapse duplicate lines on.
+    :param timestamp_from: The timestamp to start retrieving tweets from.
+    :param timestamp_to: The timestamp to stop retrieving tweets at.
+    :param limit: The maximum number of results to return.
+    :param offset: The number of lines to skip in the results.
+    """
 
-    def __init__(self, username, collapse, timestamp_from, timestamp_to, limit, offset):
+    def __init__(
+        self,
+        username: str,
+        collapse: str,
+        timestamp_from: str,
+        timestamp_to: str,
+        limit: int,
+        offset: int,
+    ):
         self.username = username
         self.collapse = collapse
         self.timestamp_from = timestamp_from
@@ -15,8 +34,13 @@ class WaybackTweets:
         self.limit = limit
         self.offset = offset
 
-    def get(self):
-        """GET request to the Internet Archive's CDX API to retrieve archived tweets."""
+    def get(self) -> Optional[Dict[str, Any]]:
+        """
+        Sends a GET request to the Internet Archive's CDX API
+        to retrieve archived tweets.
+
+        :returns: The response from the CDX API in JSON format, if successful.
+        """
         url = "https://web.archive.org/cdx/search/cdx"
         params = {
             "url": f"https://twitter.com/{self.username}/status/*",
diff --git a/waybacktweets/api/viz_tweets.py b/waybacktweets/api/viz_tweets.py
index 229bf86..f5a68cc 100644
--- a/waybacktweets/api/viz_tweets.py
+++ b/waybacktweets/api/viz_tweets.py
@@ -1,23 +1,40 @@
 # flake8: noqa: E501
 import json
+from typing import Any, Dict, List
 
 
 class HTMLTweetsVisualizer:
-    """Generates an HTML file to visualize the parsed data."""
+    """
+    Class responsible for generating an HTML file to visualize the parsed data.
 
-    def __init__(self, json_file_path, html_file_path, username):
+    :param json_content: The content of the JSON file.
+    :param html_file_path: The path where the HTML file will be saved.
+    :param username: The username associated with the tweets.
+    """
+
+    def __init__(self, json_file_path: str, html_file_path: str, username: str):
         self.json_content = self._json_loader(json_file_path)
         self.html_file_path = html_file_path
         self.username = username
 
     @staticmethod
-    def _json_loader(json_file_path):
-        """Reads and loads JSON data from a specified file path."""
+    def _json_loader(json_file_path: str) -> List[Dict[str, Any]]:
+        """
+        Reads and loads JSON data from a specified file path.
+
+        :param json_file_path: The path of the JSON file.
+
+        :returns: The content of the JSON file.
+        """
         with open(json_file_path, "r", encoding="utf-8") as f:
             return json.load(f)
 
-    def generate(self):
-        """Generates an HTML file."""
+    def generate(self) -> str:
+        """
+        Generates an HTML string that represents the parsed data.
+
+        :returns: The generated HTML string.
+        """
         html = f"\n\n@{self.username} archived tweets\n"
         html += "
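
Usage sketch: a minimal, illustrative example of how the classes documented above appear to fit together, based only on the signatures and docstrings in this diff. The argument values are placeholders, the exact shape of the payload returned by WaybackTweets.get() is an assumption, and only the "archived_digest" and "archived_length" field names are confirmed here.

# Minimal usage sketch; values are illustrative and the CDX payload shape is assumed.
from waybacktweets.api.export_tweets import TweetsExporter
from waybacktweets.api.parse_tweets import TweetsParser
from waybacktweets.api.request_tweets import WaybackTweets

username = "example_user"
# "archived_digest" and "archived_length" appear in TweetsParser._process_response;
# the full set of supported field names is not shown in this diff.
field_options = ["archived_digest", "archived_length"]

# Positional arguments follow WaybackTweets.__init__: username, collapse,
# timestamp_from, timestamp_to, limit, offset.
api = WaybackTweets(username, "urlkey", "20230101", "20240101", 100, 0)
archived_tweets = api.get()

if archived_tweets:
    parser = TweetsParser(archived_tweets, username, field_options)
    parsed_tweets = parser.parse()

    exporter = TweetsExporter(parsed_tweets, username, field_options)
    exporter.save_to_csv()
    exporter.save_to_json()
    exporter.save_to_html()  # reuses the JSON file written above if it exists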