import datetime
import os
import re
+from typing import Any, Dict, List, Optional
import pandas as pd
class TweetsExporter:
- """Handles the exporting of parsed archived tweets."""
+ """
+ Class responsible for exporting parsed archived tweets.
- def __init__(self, data, username, field_options):
+ :param data: The parsed archived tweets data.
+ :param username: The username associated with the tweets.
+ :param field_options: The fields to be included in the exported data.
+ """
+
+ def __init__(
+ self, data: Dict[str, List[Any]], username: str, field_options: List[str]
+ ):
self.data = data
self.username = username
self.field_options = field_options
self.dataframe = self._create_dataframe()
@staticmethod
- def _datetime_now():
- """Formats datetime."""
+ def _datetime_now() -> str:
+ """
+ Returns the current datetime, formatted as a string.
+
+ :returns: The current datetime.
+ """
now = datetime.datetime.now()
formatted_now = now.strftime("%Y%m%d%H%M%S")
formatted_now = re.sub(r"\W+", "", formatted_now)
return formatted_now
@staticmethod
- def _transpose_matrix(data, fill_value=None):
+ def _transpose_matrix(
+ data: Dict[str, List[Any]], fill_value: Optional[Any] = None
+ ) -> List[List[Any]]:
"""
- Transposes a matrix, filling in missing values with a specified fill value
- if needed.
+ Transposes a matrix,
+ filling in missing values with a specified fill value if needed.
+
+ :param data: The matrix to be transposed.
+ :param fill_value: The value to fill in missing values with.
+
+ :returns: The transposed matrix.
"""
max_length = max(len(sublist) for sublist in data.values())
return data_transposed
- def _create_dataframe(self):
- """Creates a DataFrame from the transposed data."""
+ def _create_dataframe(self) -> pd.DataFrame:
+ """
+ Creates a DataFrame from the transposed data.
+
+ :returns: The DataFrame representation of the data.
+ """
data_transposed = self._transpose_matrix(self.data)
df = pd.DataFrame(data_transposed, columns=self.field_options)
return df
- def save_to_csv(self):
- """Saves the DataFrame to a CSV file."""
+ def save_to_csv(self) -> None:
+ """
+ Saves the DataFrame to a CSV file.
+ """
csv_file_path = f"{self.filename}.csv"
self.dataframe.to_csv(csv_file_path, index=False)
print(f"Saved to {csv_file_path}")
- def save_to_json(self):
- """Saves the DataFrame to a JSON file."""
+ def save_to_json(self) -> None:
+ """
+ Saves the DataFrame to a JSON file.
+ """
json_file_path = f"{self.filename}.json"
self.dataframe.to_json(json_file_path, orient="records", lines=False)
print(f"Saved to {json_file_path}")
- def save_to_html(self):
- """Saves the DataFrame to an HTML file."""
+ def save_to_html(self) -> None:
+ """
+ Saves the DataFrame to an HTML file.
+ """
json_file_path = f"{self.filename}.json"
if not os.path.exists(json_file_path):
import re
from concurrent.futures import ThreadPoolExecutor, as_completed
+from typing import Any, Dict, List, Optional, Tuple
from urllib.parse import unquote
from requests import exceptions
class TwitterEmbed:
- """Handles parsing of tweets using the Twitter Publish service."""
+ """
+ Class responsible for parsing tweets using the Twitter Publish service.
- def __init__(self, tweet_url):
+ :param tweet_url: The URL of the tweet to be parsed.
+ """
+
+ def __init__(self, tweet_url: str):
self.tweet_url = tweet_url
- def embed(self):
- """Parses the archived tweets when they are still available."""
+ def embed(self) -> Optional[Tuple[List[str], List[bool], List[str]]]:
+ """
+ Parses the archived tweets when they are still available.
+
+ This function goes through each archived tweet and checks
+ if it is still available.
+ If the tweet is available, it extracts the necessary information
+ and adds it to the respective lists.
+ The function returns a tuple of three lists:
+ - The first list contains the tweet texts.
+ - The second list contains boolean values indicating whether each tweet
+ is still available.
+ - The third list contains the URLs of the tweets.
+
+ :returns: A tuple of three lists containing the tweet texts,
+ availability statuses, and URLs, respectively. If no tweets are available,
+ returns None.
+ """
try:
url = f"https://publish.twitter.com/oembed?url={self.tweet_url}"
response = get_response(url=url)
# TODO: JSON Issue - Create separate function to handle JSON return without hitting rate limiting # noqa: E501
class JsonParser:
- """Handles parsing of tweets when the mimetype is application/json."""
+ """
+ Class responsible for parsing tweets when the mimetype is application/json.
+
+ :param archived_tweet_url: The URL of the archived tweet to be parsed.
+ """
- def __init__(self, archived_tweet_url):
+ def __init__(self, archived_tweet_url: str):
self.archived_tweet_url = archived_tweet_url
- def parse(self):
- """Parses the archived tweets in JSON format."""
+ def parse(self) -> str:
+ """
+ Parses the archived tweets in JSON format.
+
+ :returns: The parsed tweet text.
+ """
try:
response = get_response(url=self.archived_tweet_url)
class TweetsParser:
- """Handles the overall parsing of archived tweets."""
-
- def __init__(self, archived_tweets_response, username, field_options):
+ """
+ Class responsible for the overall parsing of archived tweets.
+
+ :param archived_tweets_response: The response from the archived tweets.
+ :param username: The username associated with the tweets.
+ :param field_options: The fields to be included in the parsed data.
+ """
+
+ def __init__(
+ self,
+ archived_tweets_response: List[str],
+ username: str,
+ field_options: List[str],
+ ):
self.archived_tweets_response = archived_tweets_response
self.username = username
self.field_options = field_options
self.parsed_tweets = {option: [] for option in self.field_options}
- def _add_field(self, key, value):
+ def _add_field(self, key: str, value: Any) -> None:
"""
Appends a value to a list in the parsed data structure.
- Defines which data will be structured and saved.
+
+ :param key: The key in the parsed data structure.
+ :param value: The value to be appended.
"""
if key in self.parsed_tweets:
self.parsed_tweets[key].append(value)
- def _process_response(self, response):
- """Process the archived tweet's response and add the relevant CDX data."""
+ def _process_response(self, response: List[str]) -> None:
+ """
+ Processes the archived tweet's response and adds the relevant CDX data.
+
+ :param response: The response from the archived tweet.
+ """
tweet_remove_char = unquote(response[2]).replace("’", "")
cleaned_tweet = check_pattern_tweet(tweet_remove_char).strip('"')
self._add_field("archived_digest", response[5])
self._add_field("archived_length", response[6])
- def parse(self):
- """Parses the archived tweets CDX data and structures it."""
+ def parse(self) -> Dict[str, List[Any]]:
+ """
+ Parses the archived tweets CDX data and structures it.
+
+ :returns: The parsed tweets data.
+ """
with ThreadPoolExecutor(max_workers=10) as executor:
futures = {
+from typing import Any, Dict, Optional
+
from requests import exceptions
from rich import print as rprint
class WaybackTweets:
- """Requests data from the Wayback CDX Server API and returns it in JSON format."""
+ """
+ Class responsible for requesting data from the Wayback CDX Server API.
+
+ :param username: The username associated with the tweets.
+ :param collapse: The field to collapse duplicate lines on.
+ :param timestamp_from: The timestamp to start retrieving tweets from.
+ :param timestamp_to: The timestamp to stop retrieving tweets at.
+ :param limit: The maximum number of results to return.
+ :param offset: The number of lines to skip in the results.
+ """
- def __init__(self, username, collapse, timestamp_from, timestamp_to, limit, offset):
+ def __init__(
+ self,
+ username: str,
+ collapse: str,
+ timestamp_from: str,
+ timestamp_to: str,
+ limit: int,
+ offset: int,
+ ):
self.username = username
self.collapse = collapse
self.timestamp_from = timestamp_from
self.limit = limit
self.offset = offset
- def get(self):
- """GET request to the Internet Archive's CDX API to retrieve archived tweets."""
+ def get(self) -> Optional[Dict[str, Any]]:
+ """
+ Sends a GET request to the Internet Archive's CDX API
+ to retrieve archived tweets.
+
+ :returns: The response from the CDX API in JSON format, if successful.
+ """
url = "https://web.archive.org/cdx/search/cdx"
params = {
"url": f"https://twitter.com/{self.username}/status/*",
# flake8: noqa: E501
import json
+from typing import Any, Dict, List
class HTMLTweetsVisualizer:
- """Generates an HTML file to visualize the parsed data."""
+ """
+ Class responsible for generating an HTML file to visualize the parsed data.
- def __init__(self, json_file_path, html_file_path, username):
+ :param json_file_path: The path of the JSON file to load the content from.
+ :param html_file_path: The path where the HTML file will be saved.
+ :param username: The username associated with the tweets.
+ """
+
+ def __init__(self, json_file_path: str, html_file_path: str, username: str):
self.json_content = self._json_loader(json_file_path)
self.html_file_path = html_file_path
self.username = username
@staticmethod
- def _json_loader(json_file_path):
- """Reads and loads JSON data from a specified file path."""
+ def _json_loader(json_file_path: str) -> List[Dict[str, Any]]:
+ """
+ Reads and loads JSON data from a specified file path.
+
+ :param json_file_path: The path of the JSON file.
+
+ :returns: The content of the JSON file.
+ """
with open(json_file_path, "r", encoding="utf-8") as f:
return json.load(f)
- def generate(self):
- """Generates an HTML file."""
+ def generate(self) -> str:
+ """
+ Generates an HTML string that represents the parsed data.
+
+ :returns: The generated HTML string.
+ """
html = f"<html>\n<head>\n<title>@{self.username} archived tweets</title>\n"
html += "<style>\n"
return html
- def save(self, html_content):
- """Saves the generated HTML."""
+ def save(self, html_content: str) -> None:
+ """
+ Saves the generated HTML string to a file.
+
+ :param html_content: The HTML string to be saved.
+ """
with open(self.html_file_path, "w", encoding="utf-8") as f:
f.write(html_content)
"""
-Helper functions.
+Module containing utility functions for handling HTTP requests and manipulating URLs.
"""
import re
from datetime import datetime
+from typing import Any, Optional
import click
import requests
from urllib3.util.retry import Retry
-def get_response(url, params=None):
- """Sends a GET request to the specified URL and returns the response."""
+def get_response(
+ url: str, params: Optional[dict] = None
+) -> Optional[requests.Response]:
+ """
+ Sends a GET request to the specified URL and returns the response.
+
+ :param url: The URL to send the GET request to.
+ :param params: The parameters to include in the GET request.
+
+ :returns: The response from the server,
+ if the status code is not in the 400-511 range.
+ Returns None if the status code is in the 400-511 range.
+ """
session = requests.Session()
retry = Retry(connect=3, backoff_factor=0.3)
adapter = HTTPAdapter(max_retries=retry)
return response
-def clean_tweet_url(tweet_url, username):
+def clean_tweet_url(tweet_url: str, username: str) -> str:
"""
- Converts the tweet to lowercase,
- checks if it contains a tweet URL associated with the username.
- Returns the original tweet URL with correct casing;
- or returns the original tweet.
+ Cleans a tweet URL by ensuring it is associated with the correct username.
+
+ :param tweet_url: The tweet URL to clean.
+ :param username: The username to associate with the tweet URL.
+
+ :returns: The cleaned tweet URL.
"""
tweet_lower = tweet_url.lower()
return tweet_url
-def clean_wayback_machine_url(wayback_machine_url, archived_timestamp, username):
+def clean_wayback_machine_url(
+ wayback_machine_url: str, archived_timestamp: str, username: str
+) -> str:
"""
- Converts the Wayback Machine URL to lowercase,
- checks if it contains a tweet URL associated with the username.
- Returns the original tweet URL with correct casing and archived timestamp;
- otherwise, it returns the original Wayback Machine URL.
+ Cleans a Wayback Machine URL by ensuring it is associated with the correct username
+ and timestamp.
+
+ :param wayback_machine_url: The Wayback Machine URL to clean.
+ :param archived_timestamp: The timestamp to associate with the Wayback Machine URL.
+ :param username: The username to associate with the Wayback Machine URL.
+
+ :returns: The cleaned Wayback Machine URL.
"""
wayback_machine_url = wayback_machine_url.lower()
return wayback_machine_url
-def check_pattern_tweet(tweet_url):
+def check_pattern_tweet(tweet_url: str) -> str:
"""
- Extracts tweet IDs from various types of tweet URLs or tweet-related patterns.
+ Extracts the tweet ID from a tweet URL.
- Reply pattern: /status//
- Link pattern: /status///
- Twimg pattern: /status/https://pbs
+ :param tweet_url: The tweet URL to extract the ID from.
+
+ :returns: The extracted tweet ID.
"""
pattern = re.compile(r'/status/"([^"]+)"')
return tweet_url
-def delete_tweet_pathnames(tweet_url):
- """Removes any pathnames (/photos, /likes, /retweet...) from the tweet URL."""
+def delete_tweet_pathnames(tweet_url: str) -> str:
+ """
+ Removes any pathnames from a tweet URL.
+
+ :param tweet_url: The tweet URL to remove pathnames from.
+
+ :returns: The tweet URL without any pathnames.
+ """
pattern_username = re.compile(r"https://twitter\.com/([^/]+)/status/\d+")
match_username = pattern_username.match(tweet_url)
return tweet_url
-def check_double_status(wayback_machine_url, original_tweet_url):
+def check_double_status(wayback_machine_url: str, original_tweet_url: str) -> bool:
"""
Checks if a Wayback Machine URL contains two occurrences of "/status/"
and if the original tweet does not contain "twitter.com".
- Returns a boolean.
+
+ :param wayback_machine_url: The Wayback Machine URL to check.
+ :param original_tweet_url: The original tweet URL to check.
+
+ :returns: True if the conditions are met, False otherwise.
"""
if (
wayback_machine_url.count("/status/") == 2
return False
-def semicolon_parser(string):
- """Replaces semicolons in a string with %3B."""
+def semicolon_parser(string: str) -> str:
+ """
+ Replaces semicolons in a string with %3B.
+
+ :param string: The string to replace semicolons in.
+
+ :returns: The string with semicolons replaced by %3B.
+ """
return "".join("%3B" if c == ";" else c for c in string)
-def parse_date(ctx=None, param=None, value=None):
+def parse_date(
+ ctx: Optional[Any] = None, param: Optional[Any] = None, value: Optional[str] = None
+) -> Optional[str]:
"""
Parses a date string and returns it in the format "YYYYMMDD".
- This function takes an optional date string as input,
- and if a date string is provided, it parses the date string into a datetime object
- and then formats it in the "YYYYMMDD" format.
-
- Args:
- ctx (None, optional): Necessary when used with the click package.
- Defaults to None.
- param (None, optional): Necessary when used with the click package.
- Defaults to None.
- value (str, optional): A date string in the "YYYYMMDD" format. Defaults to None.
+ :param ctx: Necessary when used with the click package. Defaults to None.
+ :param param: Necessary when used with the click package. Defaults to None.
+ :param value: A date string in the "YYYYMMDD" format. Defaults to None.
- Returns:
- str: The input date string formatted in the "YYYYMMDD" format,
+ :returns: The input date string formatted in the "YYYYMMDD" format,
or None if no date string was provided.
"""
try: