@{self.username} archived tweets

From: Claromes Date: Mon, 17 Jun 2024 01:57:54 +0000 (-0300) Subject: update file name X-Git-Url: https://git.claromes.com/?a=commitdiff_plain;h=90a8611c2f4ee85ae8647995e3095c0605a97c0e;p=waybacktweets.git update file name --- diff --git a/.gitignore b/.gitignore index 26166db..5df8b23 100644 --- a/.gitignore +++ b/.gitignore @@ -1,9 +1,12 @@ *.csv *.json *.html + waybacktweets/__pycache__ waybacktweets/api/__pycache__ -waybacktweets/cli/__pycache__ +waybacktweets/config/__pycache__ +waybacktweets/exceptions/__pycache__ waybacktweets/utils/__pycache__ + docs/_build/ notes.md diff --git a/app/app.py b/app/app.py index b1db36b..b0c6f71 100644 --- a/app/app.py +++ b/app/app.py @@ -3,9 +3,9 @@ import datetime import streamlit as st import streamlit.components.v1 as components -from waybacktweets.api.export_tweets import TweetsExporter -from waybacktweets.api.parse_tweets import JsonParser, TweetsParser -from waybacktweets.api.request_tweets import WaybackTweets +from waybacktweets.api.export import TweetsExporter +from waybacktweets.api.parse import JsonParser, TweetsParser +from waybacktweets.api.request import WaybackTweets from waybacktweets.config.config import config from waybacktweets.exceptions.exceptions import ( ConnectionError, diff --git a/docs/api.rst b/docs/api.rst index ee5ad43..7e7ab9f 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -32,7 +32,7 @@ Exceptions Export --------- -.. automodule:: waybacktweets.api.export_tweets +.. automodule:: waybacktweets.api.export .. autoclass:: TweetsExporter :members: @@ -41,7 +41,7 @@ Export Parse --------- -.. automodule:: waybacktweets.api.parse_tweets +.. automodule:: waybacktweets.api.parse .. autoclass:: TweetsParser :members: @@ -56,7 +56,7 @@ Parse Request --------- -.. automodule:: waybacktweets.api.request_tweets +.. automodule:: waybacktweets.api.request .. autoclass:: WaybackTweets :members: @@ -80,7 +80,7 @@ Utils Visualizer ----------- -.. automodule:: waybacktweets.api.viz_tweets +.. 
automodule:: waybacktweets.api.visualize .. autoclass:: HTMLTweetsVisualizer :members: diff --git a/docs/contribute.rst b/docs/contribute.rst index 84ed2cb..0191658 100644 --- a/docs/contribute.rst +++ b/docs/contribute.rst @@ -27,10 +27,11 @@ Brief explanation about the code under the Wayback Tweets directory: - ``assets``: Title and logo images - ``docs``: Documentation generated with Sphinx - ``waybacktweets/api``: Main package modules -- ``waybacktweets/cli``: Command line Interface module +- ``waybacktweets/config``: Global configuration module +- ``waybacktweets/exceptions``: Wayback Tweets Exceptions - ``waybacktweets/utils``: Helper functions used in the package Sponsoring ------------ +------------ You can also donate to the project's developer and maintainer, `Claromes `_, via `GitHub Sponsor `_ or if you are interested in sponsoring the project you can contact via email at support at claromes dot com. diff --git a/docs/todo.rst b/docs/todo.rst index cda3ea5..eaced03 100644 --- a/docs/todo.rst +++ b/docs/todo.rst @@ -5,9 +5,7 @@ TODO -|uncheck| Code: JSON Issue: Create a separate function to handle JSON return, apply JsonParser (``waybacktweets/api/parse_tweets.py:73``), and avoid rate limiting (`Planned for v1.1`) - -|uncheck| Docs: Add tutorial on how to save Tweet via command line (`Planned for v1.1`) +|uncheck| Code: JSON Parser: Create a separate function to handle JSON return, apply JsonParser (``waybacktweets/api/parse.py:111``), and avoid rate limiting (`Planned for v1.1`) |uncheck| Code: Download images when tweet URL has extensions like JPG or PNG (`Planned for v1.2`) diff --git a/waybacktweets/__init__.py b/waybacktweets/__init__.py index 75b5ae4..f8333d8 100644 --- a/waybacktweets/__init__.py +++ b/waybacktweets/__init__.py @@ -1,8 +1,8 @@ # flake8: noqa: F401 -from waybacktweets.api.export_tweets import TweetsExporter -from waybacktweets.api.parse_tweets import JsonParser, TweetsParser, TwitterEmbed -from waybacktweets.api.request_tweets 
import WaybackTweets -from waybacktweets.api.viz_tweets import HTMLTweetsVisualizer +from waybacktweets.api.export import TweetsExporter +from waybacktweets.api.parse import JsonParser, TweetsParser, TwitterEmbed +from waybacktweets.api.request import WaybackTweets +from waybacktweets.api.visualize import HTMLTweetsVisualizer __version__ = "1.0" diff --git a/waybacktweets/_cli.py b/waybacktweets/_cli.py index 74fb140..6039477 100644 --- a/waybacktweets/_cli.py +++ b/waybacktweets/_cli.py @@ -8,9 +8,9 @@ from typing import Any, Optional import click from rich import print as rprint -from waybacktweets.api.export_tweets import TweetsExporter -from waybacktweets.api.parse_tweets import TweetsParser -from waybacktweets.api.request_tweets import WaybackTweets +from waybacktweets.api.export import TweetsExporter +from waybacktweets.api.parse import TweetsParser +from waybacktweets.api.request import WaybackTweets from waybacktweets.config.config import config diff --git a/waybacktweets/api/export.py b/waybacktweets/api/export.py new file mode 100644 index 0000000..7751679 --- /dev/null +++ b/waybacktweets/api/export.py @@ -0,0 +1,117 @@ +""" +Exports the parsed archived tweets. +""" + +import datetime +import os +import re +from typing import Any, Dict, List, Optional + +import pandas as pd + +from waybacktweets.api.visualize import HTMLTweetsVisualizer + + +class TweetsExporter: + """ + Class responsible for exporting parsed archived tweets. + + :param data: The parsed archived tweets data. + :param username: The username associated with the tweets. + :param field_options: The fields to be included in the exported data. Options include "archived_urlkey", "archived_timestamp", "original_tweet_url", "archived_tweet_url", "parsed_tweet_url", "parsed_archived_tweet_url", "available_tweet_text", "available_tweet_is_RT", "available_tweet_info", "archived_mimetype", "archived_statuscode", "archived_digest", "archived_length". 
+ """ # noqa: E501 + + def __init__( + self, data: Dict[str, List[Any]], username: str, field_options: List[str] + ): + self.data = data + self.username = username + self.field_options = field_options + self.formatted_datetime = self._datetime_now() + self.filename = f"{self.username}_tweets_{self.formatted_datetime}" + self.dataframe = self._create_dataframe() + + @staticmethod + def _datetime_now() -> str: + """ + Returns the current datetime, formatted as a string. + + :returns: The current datetime. + """ + now = datetime.datetime.now() + formatted_now = now.strftime("%Y%m%d%H%M%S") + formatted_now = re.sub(r"\W+", "", formatted_now) + + return formatted_now + + @staticmethod + def _transpose_matrix( + data: Dict[str, List[Any]], fill_value: Optional[Any] = None + ) -> List[List[Any]]: + """ + Transposes a matrix, + filling in missing values with a specified fill value if needed. + + :param data: The matrix to be transposed. + :param fill_value: The value to fill in missing values with. + + :returns: The transposed matrix. + """ + max_length = max(len(sublist) for sublist in data.values()) + + filled_data = { + key: value + [fill_value] * (max_length - len(value)) + for key, value in data.items() + } + + data_transposed = [list(row) for row in zip(*filled_data.values())] + + return data_transposed + + def _create_dataframe(self) -> pd.DataFrame: + """ + Creates a DataFrame from the transposed data. + + :returns: The DataFrame representation of the data. + """ + data_transposed = self._transpose_matrix(self.data) + + df = pd.DataFrame(data_transposed, columns=self.field_options) + + return df + + def save_to_csv(self) -> None: + """ + Saves the DataFrame to a CSV file. + """ + csv_file_path = f"{self.filename}.csv" + self.dataframe.to_csv(csv_file_path, index=False) + + print(f"Saved to {csv_file_path}") + + def save_to_json(self) -> None: + """ + Saves the DataFrame to a JSON file. 
+ """ + json_file_path = f"{self.filename}.json" + self.dataframe.to_json(json_file_path, orient="records", lines=False) + + print(f"Saved to {json_file_path}") + + def save_to_html(self) -> None: + """ + Saves the DataFrame to an HTML file. + """ + json_file_path = f"{self.filename}.json" + + if not os.path.exists(json_file_path): + self.save_to_json() + + html_file_path = f"{self.filename}.html" + + html = HTMLTweetsVisualizer(json_file_path, html_file_path, self.username) + + html_content = html.generate() + html.save(html_content) + + print(f"Saved to {html_file_path}") diff --git a/waybacktweets/api/export_tweets.py b/waybacktweets/api/export_tweets.py deleted file mode 100644 index a6daf41..0000000 --- a/waybacktweets/api/export_tweets.py +++ /dev/null @@ -1,117 +0,0 @@ -""" -Exports the parsed archived tweets. -""" - -import datetime -import os -import re -from typing import Any, Dict, List, Optional - -import pandas as pd - -from waybacktweets.api.viz_tweets import HTMLTweetsVisualizer - - -class TweetsExporter: - """ - Class responsible for exporting parsed archived tweets. - - :param data: The parsed archived tweets data. - :param username: The username associated with the tweets. - :param field_options: The fields to be included in the exported data. Options include "archived_urlkey", "archived_timestamp", "original_tweet_url", "archived_tweet_url", "parsed_tweet_url", "parsed_archived_tweet_url", "available_tweet_text", "available_tweet_is_RT", "available_tweet_info", "archived_mimetype", "archived_statuscode", "archived_digest", "archived_length". 
- """ # noqa: E501 - - def __init__( - self, data: Dict[str, List[Any]], username: str, field_options: List[str] - ): - self.data = data - self.username = username - self.field_options = field_options - self.formatted_datetime = self._datetime_now() - self.filename = f"{self.username}_tweets_{self.formatted_datetime}" - self.dataframe = self._create_dataframe() - - @staticmethod - def _datetime_now() -> str: - """ - Returns the current datetime, formatted as a string. - - :returns: The current datetime. - """ - now = datetime.datetime.now() - formatted_now = now.strftime("%Y%m%d%H%M%S") - formatted_now = re.sub(r"\W+", "", formatted_now) - - return formatted_now - - @staticmethod - def _transpose_matrix( - data: Dict[str, List[Any]], fill_value: Optional[Any] = None - ) -> List[List[Any]]: - """ - Transposes a matrix, - filling in missing values with a specified fill value if needed. - - :param data: The matrix to be transposed. - :param fill_value: The value to fill in missing values with. - - :returns: The transposed matrix. - """ - max_length = max(len(sublist) for sublist in data.values()) - - filled_data = { - key: value + [fill_value] * (max_length - len(value)) - for key, value in data.items() - } - - data_transposed = [list(row) for row in zip(*filled_data.values())] - - return data_transposed - - def _create_dataframe(self) -> pd.DataFrame: - """ - Creates a DataFrame from the transposed data. - - :returns: The DataFrame representation of the data. - """ - data_transposed = self._transpose_matrix(self.data) - - df = pd.DataFrame(data_transposed, columns=self.field_options) - - return df - - def save_to_csv(self) -> None: - """ - Saves the DataFrame to a CSV file. - """ - csv_file_path = f"{self.filename}.csv" - self.dataframe.to_csv(csv_file_path, index=False) - - print(f"Saved to {csv_file_path}") - - def save_to_json(self) -> None: - """ - Saves the DataFrame to a JSON file. 
- """ - json_file_path = f"{self.filename}.json" - self.dataframe.to_json(json_file_path, orient="records", lines=False) - - print(f"Saved to {json_file_path}") - - def save_to_html(self) -> None: - """ - Saves the DataFrame to an HTML file. - """ - json_file_path = f"{self.filename}.json" - - if not os.path.exists(json_file_path): - self.save_to_json() - - html_file_path = f"{self.filename}.html" - - html = HTMLTweetsVisualizer(json_file_path, html_file_path, self.username) - - html_content = html.generate() - html.save(html_content) - - print(f"Saved to {html_file_path}") diff --git a/waybacktweets/api/parse.py b/waybacktweets/api/parse.py new file mode 100644 index 0000000..519696c --- /dev/null +++ b/waybacktweets/api/parse.py @@ -0,0 +1,281 @@ +""" +Parses the returned data from the Wayback CDX Server API. +""" + +import re +from concurrent.futures import ThreadPoolExecutor, as_completed +from contextlib import nullcontext +from typing import Any, Dict, List, Optional, Tuple +from urllib.parse import unquote + +from rich import print as rprint +from rich.progress import Progress + +from waybacktweets.config.config import config +from waybacktweets.exceptions.exceptions import ( + ConnectionError, + GetResponseError, + HTTPError, +) +from waybacktweets.utils.utils import ( + check_double_status, + check_pattern_tweet, + clean_tweet_url, + delete_tweet_pathnames, + get_response, + is_tweet_url, + semicolon_parser, +) + + +class TwitterEmbed: + """ + Class responsible for parsing tweets using the Twitter Publish service. + + :param tweet_url: The URL of the tweet to be parsed. + """ + + def __init__(self, tweet_url: str): + self.tweet_url = tweet_url + + def embed(self) -> Optional[Tuple[List[str], List[bool], List[str]]]: + """ + Parses the archived tweets when they are still available. + + This function goes through each archived tweet and checks + if it is still available. 
+ If the tweet is available, it extracts the necessary information + and adds it to the respective lists. + The function returns a tuple of three lists: + - The first list contains the tweet texts. + - The second list contains boolean values indicating whether each tweet + is still available. + - The third list contains the URLs of the tweets. + + :returns: A tuple of three lists containing the tweet texts, + availability statuses, and URLs, respectively. If no tweets are available, + returns None. + """ + try: + url = f"https://publish.twitter.com/oembed?url={self.tweet_url}" + response = get_response(url=url) + if response: + json_response = response.json() + html = json_response["html"] + author_name = json_response["author_name"] + + regex = re.compile( + r'<blockquote class="twitter-tweet"(?: [^>]+)?><p[^>]*>(.*?)<\/p>.*?&mdash; (.*?)<\/a>', # noqa + re.DOTALL, + ) + regex_author = re.compile(r"^(.*?)\s*\(") + + matches_html = regex.findall(html) + + tweet_content = [] + user_info = [] + is_RT = [] + + for match in matches_html: + tweet_content_match = re.sub( + r"<a[^>]*>|<\/a>", "", match[0].strip() + ).replace("<br>", "\n") + user_info_match = re.sub( + r"<a[^>]*>|<\/a>", "", match[1].strip() + ).replace(")", "), ") + match_author = regex_author.search(user_info_match) + author_tweet = match_author.group(1) if match_author else "" + + if tweet_content_match: + tweet_content.append(tweet_content_match) + if user_info_match: + user_info.append(user_info_match) + is_RT.append(author_name != author_tweet) + + return tweet_content, is_RT, user_info + except ConnectionError: + if config.verbose: + rprint("[yellow]Error parsing the tweet, but the CDX data was saved.") + except HTTPError: + if config.verbose: + rprint( + f"[yellow]{self.tweet_url} not available on the user's account, but the CDX data was saved." # noqa: E501 + ) + except GetResponseError as e: + if config.verbose: + rprint(f"[red]An error occurred: {str(e)}") + + return None + + +class JsonParser: + """ + Class responsible for parsing tweets when the mimetype is application/json.\n + Note: This class is in an experimental phase, but it is currently being + used by the Streamlit Web App. + + :param archived_tweet_url: The URL of the archived tweet to be parsed. + """ + + def __init__(self, archived_tweet_url: str): + self.archived_tweet_url = archived_tweet_url + + def parse(self) -> str: + """ + Parses the archived tweets in JSON format. + + :returns: The parsed tweet text. + """ + try: + response = get_response(url=self.archived_tweet_url) + + if response: + json_data = response.json() + + if "data" in json_data: + return json_data["data"].get("text", json_data["data"]) + + if "retweeted_status" in json_data: + return json_data["retweeted_status"].get( + "text", json_data["retweeted_status"] + ) + + return json_data.get("text", json_data) + except ConnectionError: + if config.verbose: + rprint( + f"[yellow]Connection error with {self.archived_tweet_url}. Max retries exceeded. Error parsing the JSON, but the CDX data was saved." 
# noqa: E501 + ) + except GetResponseError as e: + if config.verbose: + rprint(f"[red]An error occurred: {str(e)}") + + return None + + +class TweetsParser: + """ + Class responsible for the overall parsing of archived tweets. + + :param archived_tweets_response: The response from the archived tweets. + :param username: The username associated with the tweets. + :param field_options: The fields to be included in the parsed data. Options include "archived_urlkey", "archived_timestamp", "original_tweet_url", "archived_tweet_url", "parsed_tweet_url", "parsed_archived_tweet_url", "available_tweet_text", "available_tweet_is_RT", "available_tweet_info", "archived_mimetype", "archived_statuscode", "archived_digest", "archived_length". + """ # noqa: E501 + + def __init__( + self, + archived_tweets_response: List[str], + username: str, + field_options: List[str], + ): + self.archived_tweets_response = archived_tweets_response + self.username = username + self.field_options = field_options + self.parsed_tweets = {option: [] for option in self.field_options} + + def _add_field(self, key: str, value: Any) -> None: + """ + Appends a value to a list in the parsed data structure. + + :param key: The key in the parsed data structure. + :param value: The value to be appended. + """ + if key in self.parsed_tweets: + self.parsed_tweets[key].append(value) + + def _process_response(self, response: List[str]) -> None: + """ + Processes the archived tweet's response and adds the relevant CDX data. + + :param response: The response from the archived tweet. 
+ """ + tweet_remove_char = unquote(response[2]).replace("â", "") + cleaned_tweet = check_pattern_tweet(tweet_remove_char).strip('"') + + wayback_machine_url = ( + f"https://web.archive.org/web/{response[1]}/{tweet_remove_char}" + ) + original_tweet = delete_tweet_pathnames( + clean_tweet_url(cleaned_tweet, self.username) + ) + parsed_wayback_machine_url = ( + f"https://web.archive.org/web/{response[1]}/{original_tweet}" + ) + + double_status = check_double_status(wayback_machine_url, original_tweet) + + if double_status: + original_tweet = delete_tweet_pathnames( + f"https://twitter.com/{original_tweet}" + ) + elif "://" not in original_tweet: + original_tweet = delete_tweet_pathnames(f"https://{original_tweet}") + + encoded_tweet = semicolon_parser(response[2]) + encoded_archived_tweet = semicolon_parser(wayback_machine_url) + encoded_parsed_tweet = semicolon_parser(original_tweet) + encoded_parsed_archived_tweet = semicolon_parser(parsed_wayback_machine_url) + + available_tweet_text = None + available_tweet_is_RT = None + available_tweet_info = None + + is_tweet = is_tweet_url(encoded_tweet) + + if is_tweet: + embed_parser = TwitterEmbed(encoded_tweet) + content = embed_parser.embed() + + if content: + available_tweet_text = semicolon_parser(content[0][0]) + available_tweet_is_RT = content[1][0] + available_tweet_info = semicolon_parser(content[2][0]) + + self._add_field("available_tweet_text", available_tweet_text) + self._add_field("available_tweet_is_RT", available_tweet_is_RT) + self._add_field("available_tweet_info", available_tweet_info) + + self._add_field("archived_urlkey", response[0]) + self._add_field("archived_timestamp", response[1]) + self._add_field("original_tweet_url", encoded_tweet) + self._add_field("archived_tweet_url", encoded_archived_tweet) + self._add_field("parsed_tweet_url", encoded_parsed_tweet) + self._add_field("parsed_archived_tweet_url", encoded_parsed_archived_tweet) + self._add_field("archived_mimetype", response[3]) + 
self._add_field("archived_statuscode", response[4]) + self._add_field("archived_digest", response[5]) + self._add_field("archived_length", response[6]) + + def parse(self, print_progress=False) -> Dict[str, List[Any]]: + """ + Parses the archived tweets CDX data and structures it. + + :param print_progress: A boolean indicating whether to print progress or not. + + :returns: The parsed tweets data. + """ + with ThreadPoolExecutor(max_workers=10) as executor: + + futures = { + executor.submit(self._process_response, response): response + for response in self.archived_tweets_response[1:] + } + + progress_context = Progress() if print_progress else nullcontext() + with progress_context as progress: + task = None + if print_progress: + task = progress.add_task( + f"Waybacking @{self.username} tweets\n", total=len(futures) + ) + + for future in as_completed(futures): + try: + future.result() + except Exception as e: + rprint(f"[red]{e}") + + if print_progress: + progress.update(task, advance=1) + + return self.parsed_tweets diff --git a/waybacktweets/api/parse_tweets.py b/waybacktweets/api/parse_tweets.py deleted file mode 100644 index 0b6c8d5..0000000 --- a/waybacktweets/api/parse_tweets.py +++ /dev/null @@ -1,294 +0,0 @@ -""" -Parses the returned data from the Wayback CDX Server API. 
-""" - - import re -from concurrent.futures import ThreadPoolExecutor, as_completed -from contextlib import nullcontext -from typing import Any, Dict, List, Optional, Tuple -from urllib.parse import unquote - -from rich import print as rprint -from rich.progress import Progress - -from waybacktweets.config.config import config -from waybacktweets.exceptions.exceptions import ( - ConnectionError, - GetResponseError, - HTTPError, -) -from waybacktweets.utils.utils import ( - check_double_status, - check_pattern_tweet, - clean_tweet_url, - delete_tweet_pathnames, - get_response, - is_tweet_url, - semicolon_parser, -) - - -class TwitterEmbed: - """ - Class responsible for parsing tweets using the Twitter Publish service. - - :param tweet_url: The URL of the tweet to be parsed. - """ - - def __init__(self, tweet_url: str): - self.tweet_url = tweet_url - - def embed(self) -> Optional[Tuple[List[str], List[bool], List[str]]]: - """ - Parses the archived tweets when they are still available. - - This function goes through each archived tweet and checks - if it is still available. - If the tweet is available, it extracts the necessary information - and adds it to the respective lists. - The function returns a tuple of three lists: - - The first list contains the tweet texts. - - The second list contains boolean values indicating whether each tweet - is still available. - - The third list contains the URLs of the tweets. - - :returns: A tuple of three lists containing the tweet texts, - availability statuses, and URLs, respectively. If no tweets are available, - returns None. - """ - try: - url = f"https://publish.twitter.com/oembed?url={self.tweet_url}" - response = get_response(url=url) - if response: - json_response = response.json() - html = json_response["html"] - author_name = json_response["author_name"] - - regex = re.compile( - r'<blockquote class="twitter-tweet"(?: [^>]+)?><p[^>]*>(.*?)<\/p>.*?&mdash; (.*?)<\/a>', # noqa - re.DOTALL, - ) - regex_author = re.compile(r"^(.*?)\s*\(") - - matches_html = regex.findall(html) - - tweet_content = [] - user_info = [] - is_RT = [] - - for match in matches_html: - tweet_content_match = re.sub( - r"<a[^>]*>|<\/a>", "", match[0].strip() - ).replace("<br>", "\n") - user_info_match = re.sub( - r"<a[^>]*>|<\/a>", "", match[1].strip() - ).replace(")", "), ") - match_author = regex_author.search(user_info_match) - author_tweet = match_author.group(1) if match_author else "" - - if tweet_content_match: - tweet_content.append(tweet_content_match) - if user_info_match: - user_info.append(user_info_match) - is_RT.append(author_name != author_tweet) - - return tweet_content, is_RT, user_info - except ConnectionError: - if config.verbose: - rprint("[yellow]Error parsing the tweet, but the CDX data was saved.") - except HTTPError: - if config.verbose: - rprint( - f"[yellow]{self.tweet_url} not available on the user's account, but the CDX data was saved." # noqa: E501 - ) - except GetResponseError as e: - if config.verbose: - rprint(f"[red]An error occurred: {str(e)}") - - return None - - -# TODO: JSON Issue - Create separate function to handle JSON return without hitting rate limiting # noqa: E501 -class JsonParser: - """ - Class responsible for parsing tweets when the mimetype is application/json.\n - Note: This class is in an experimental phase, but it is currently being - used by the Streamlit Web App. - - :param archived_tweet_url: The URL of the archived tweet to be parsed. - """ - - def __init__(self, archived_tweet_url: str): - self.archived_tweet_url = archived_tweet_url - - def parse(self) -> str: - """ - Parses the archived tweets in JSON format. - - :returns: The parsed tweet text. - """ - try: - response = get_response(url=self.archived_tweet_url) - - if response: - json_data = response.json() - - if "data" in json_data: - return json_data["data"].get("text", json_data["data"]) - - if "retweeted_status" in json_data: - return json_data["retweeted_status"].get( - "text", json_data["retweeted_status"] - ) - - return json_data.get("text", json_data) - except ConnectionError: - if config.verbose: - rprint( - f"[yellow]Connection error with {self.archived_tweet_url}. Max retries exceeded. 
Error parsing the JSON, but the CDX data was saved." # noqa: E501 - ) - except GetResponseError as e: - if config.verbose: - rprint(f"[red]An error occurred: {str(e)}") - - return None - - -class TweetsParser: - """ - Class responsible for the overall parsing of archived tweets. - - :param archived_tweets_response: The response from the archived tweets. - :param username: The username associated with the tweets. - :param field_options: The fields to be included in the parsed data. Options include "archived_urlkey", "archived_timestamp", "original_tweet_url", "archived_tweet_url", "parsed_tweet_url", "parsed_archived_tweet_url", "available_tweet_text", "available_tweet_is_RT", "available_tweet_info", "archived_mimetype", "archived_statuscode", "archived_digest", "archived_length". - """ # noqa: E501 - - def __init__( - self, - archived_tweets_response: List[str], - username: str, - field_options: List[str], - ): - self.archived_tweets_response = archived_tweets_response - self.username = username - self.field_options = field_options - self.parsed_tweets = {option: [] for option in self.field_options} - - def _add_field(self, key: str, value: Any) -> None: - """ - Appends a value to a list in the parsed data structure. - - :param key: The key in the parsed data structure. - :param value: The value to be appended. - """ - if key in self.parsed_tweets: - self.parsed_tweets[key].append(value) - - def _process_response(self, response: List[str]) -> None: - """ - Processes the archived tweet's response and adds the relevant CDX data. - - :param response: The response from the archived tweet. 
- """ - tweet_remove_char = unquote(response[2]).replace("â", "") - cleaned_tweet = check_pattern_tweet(tweet_remove_char).strip('"') - - wayback_machine_url = ( - f"https://web.archive.org/web/{response[1]}/{tweet_remove_char}" - ) - original_tweet = delete_tweet_pathnames( - clean_tweet_url(cleaned_tweet, self.username) - ) - parsed_wayback_machine_url = ( - f"https://web.archive.org/web/{response[1]}/{original_tweet}" - ) - - double_status = check_double_status(wayback_machine_url, original_tweet) - - if double_status: - original_tweet = delete_tweet_pathnames( - f"https://twitter.com/{original_tweet}" - ) - elif "://" not in original_tweet: - original_tweet = delete_tweet_pathnames(f"https://{original_tweet}") - - encoded_tweet = semicolon_parser(response[2]) - encoded_archived_tweet = semicolon_parser(wayback_machine_url) - encoded_parsed_tweet = semicolon_parser(original_tweet) - encoded_parsed_archived_tweet = semicolon_parser(parsed_wayback_machine_url) - - available_tweet_text = None - available_tweet_is_RT = None - available_tweet_info = None - - is_tweet = is_tweet_url(encoded_tweet) - - if is_tweet: - embed_parser = TwitterEmbed(encoded_tweet) - content = embed_parser.embed() - - if content: - available_tweet_text = semicolon_parser(content[0][0]) - available_tweet_is_RT = content[1][0] - available_tweet_info = semicolon_parser(content[2][0]) - - self._add_field("available_tweet_text", available_tweet_text) - self._add_field("available_tweet_is_RT", available_tweet_is_RT) - self._add_field("available_tweet_info", available_tweet_info) - - # TODO: JSON Issue - # parsed_text_json = "" - - # if response[3] == "application/json": - # json_parser = JsonParser(encoded_parsed_archived_tweet) - # text_json = json_parser.parse() - - # if text_json: - # parsed_text_json = semicolon_parser(text_json) - - # self._add_field("parsed_tweet_text_mimetype_json", parsed_text_json) - - self._add_field("archived_urlkey", response[0]) - self._add_field("archived_timestamp", 
response[1]) - self._add_field("original_tweet_url", encoded_tweet) - self._add_field("archived_tweet_url", encoded_archived_tweet) - self._add_field("parsed_tweet_url", encoded_parsed_tweet) - self._add_field("parsed_archived_tweet_url", encoded_parsed_archived_tweet) - self._add_field("archived_mimetype", response[3]) - self._add_field("archived_statuscode", response[4]) - self._add_field("archived_digest", response[5]) - self._add_field("archived_length", response[6]) - - def parse(self, print_progress=False) -> Dict[str, List[Any]]: - """ - Parses the archived tweets CDX data and structures it. - - :param print_progress: A boolean indicating whether to print progress or not. - - :returns: The parsed tweets data. - """ - with ThreadPoolExecutor(max_workers=10) as executor: - - futures = { - executor.submit(self._process_response, response): response - for response in self.archived_tweets_response[1:] - } - - progress_context = Progress() if print_progress else nullcontext() - with progress_context as progress: - task = None - if print_progress: - task = progress.add_task( - f"Waybacking @{self.username} tweets\n", total=len(futures) - ) - - for future in as_completed(futures): - try: - future.result() - except Exception as e: - rprint(f"[red]{e}") - - if print_progress: - progress.update(task, advance=1) - - return self.parsed_tweets diff --git a/waybacktweets/api/request.py b/waybacktweets/api/request.py new file mode 100644 index 0000000..d7d37a1 --- /dev/null +++ b/waybacktweets/api/request.py @@ -0,0 +1,108 @@ +""" +Requests data from the Wayback Machine API. 
+""" + +from typing import Any, Dict, Optional + +from rich import print as rprint + +from waybacktweets.config.config import config +from waybacktweets.exceptions.exceptions import ( + ConnectionError, + EmptyResponseError, + GetResponseError, + HTTPError, + ReadTimeoutError, +) +from waybacktweets.utils.utils import get_response + + +class WaybackTweets: + """ + Class responsible for requesting data from the Wayback CDX Server API. + + :param username: The username associated with the tweets. + :param collapse: The field to collapse duplicate lines on. + :param timestamp_from: The timestamp to start retrieving tweets from. + :param timestamp_to: The timestamp to stop retrieving tweets at. + :param limit: The maximum number of results to return. + :param offset: The number of lines to skip in the results. + :param matchType: Results matching a certain prefix, a certain host or all subdomains. + """ # noqa: E501 + + def __init__( + self, + username: str, + collapse: str = None, + timestamp_from: str = None, + timestamp_to: str = None, + limit: int = None, + offset: int = None, + matchtype: str = None, + ): + self.username = username + self.collapse = collapse + self.timestamp_from = timestamp_from + self.timestamp_to = timestamp_to + self.limit = limit + self.offset = offset + self.matchtype = matchtype + + def get(self) -> Optional[Dict[str, Any]]: + """ + Sends a GET request to the Internet Archive's CDX API + to retrieve archived tweets. + + :returns: The response from the CDX API in JSON format, if successful. 
+ """ + url = "https://web.archive.org/cdx/search/cdx" + + status_pathname = "status/*" + if self.matchtype: + status_pathname = "" + + params = { + "url": f"https://twitter.com/{self.username}/{status_pathname}", + "output": "json", + } + + if self.collapse: + params["collapse"] = self.collapse + + if self.timestamp_from: + params["from"] = self.timestamp_from + + if self.timestamp_to: + params["to"] = self.timestamp_to + + if self.limit: + params["limit"] = self.limit + + if self.offset: + params["offset"] = self.offset + + if self.matchtype: + params["matchType"] = self.matchtype + + try: + response = get_response(url=url, params=params) + return response.json() + except ReadTimeoutError: + if config.verbose: + rprint("[red]Connection to web.archive.org timed out.") + except ConnectionError: + if config.verbose: + rprint( + "[red]Failed to establish a new connection with web.archive.org. Max retries exceeded. Please wait a few minutes and try again." # noqa: E501 + ) + except HTTPError as e: + if config.verbose: + rprint(f"[red]HTTP error occurred: {str(e)}") + except EmptyResponseError: + if config.verbose: + rprint("[red]No data was saved due to an empty response.") + except GetResponseError as e: + if config.verbose: + rprint(f"[red]An error occurred: {str(e)}") + + return None diff --git a/waybacktweets/api/request_tweets.py b/waybacktweets/api/request_tweets.py deleted file mode 100644 index d7d37a1..0000000 --- a/waybacktweets/api/request_tweets.py +++ /dev/null @@ -1,108 +0,0 @@ -""" -Requests data from the Wayback Machine API. 
-""" - -from typing import Any, Dict, Optional - -from rich import print as rprint - -from waybacktweets.config.config import config -from waybacktweets.exceptions.exceptions import ( - ConnectionError, - EmptyResponseError, - GetResponseError, - HTTPError, - ReadTimeoutError, -) -from waybacktweets.utils.utils import get_response - - -class WaybackTweets: - """ - Class responsible for requesting data from the Wayback CDX Server API. - - :param username: The username associated with the tweets. - :param collapse: The field to collapse duplicate lines on. - :param timestamp_from: The timestamp to start retrieving tweets from. - :param timestamp_to: The timestamp to stop retrieving tweets at. - :param limit: The maximum number of results to return. - :param offset: The number of lines to skip in the results. - :param matchType: Results matching a certain prefix, a certain host or all subdomains. - """ # noqa: E501 - - def __init__( - self, - username: str, - collapse: str = None, - timestamp_from: str = None, - timestamp_to: str = None, - limit: int = None, - offset: int = None, - matchtype: str = None, - ): - self.username = username - self.collapse = collapse - self.timestamp_from = timestamp_from - self.timestamp_to = timestamp_to - self.limit = limit - self.offset = offset - self.matchtype = matchtype - - def get(self) -> Optional[Dict[str, Any]]: - """ - Sends a GET request to the Internet Archive's CDX API - to retrieve archived tweets. - - :returns: The response from the CDX API in JSON format, if successful. 
- """ - url = "https://web.archive.org/cdx/search/cdx" - - status_pathname = "status/*" - if self.matchtype: - status_pathname = "" - - params = { - "url": f"https://twitter.com/{self.username}/{status_pathname}", - "output": "json", - } - - if self.collapse: - params["collapse"] = self.collapse - - if self.timestamp_from: - params["from"] = self.timestamp_from - - if self.timestamp_to: - params["to"] = self.timestamp_to - - if self.limit: - params["limit"] = self.limit - - if self.offset: - params["offset"] = self.offset - - if self.matchtype: - params["matchType"] = self.matchtype - - try: - response = get_response(url=url, params=params) - return response.json() - except ReadTimeoutError: - if config.verbose: - rprint("[red]Connection to web.archive.org timed out.") - except ConnectionError: - if config.verbose: - rprint( - "[red]Failed to establish a new connection with web.archive.org. Max retries exceeded. Please wait a few minutes and try again." # noqa: E501 - ) - except HTTPError as e: - if config.verbose: - rprint(f"[red]HTTP error occurred: {str(e)}") - except EmptyResponseError: - if config.verbose: - rprint("[red]No data was saved due to an empty response.") - except GetResponseError as e: - if config.verbose: - rprint(f"[red]An error occurred: {str(e)}") - - return None diff --git a/waybacktweets/api/visualize.py b/waybacktweets/api/visualize.py new file mode 100644 index 0000000..ef13aac --- /dev/null +++ b/waybacktweets/api/visualize.py @@ -0,0 +1,126 @@ +# flake8: noqa: E501 +""" +Generates an HTML file to visualize the parsed data. +""" + +import json +from typing import Any, Dict, List + + +class HTMLTweetsVisualizer: + """ + Class responsible for generating an HTML file to visualize the parsed data. + + :param json_content: The content of the JSON file. + :param html_file_path: The path where the HTML file will be saved. + :param username: The username associated with the tweets. 
+ """ + + def __init__(self, json_file_path: str, html_file_path: str, username: str): + self.json_content = self._json_loader(json_file_path) + self.html_file_path = html_file_path + self.username = username + + @staticmethod + def _json_loader(json_file_path: str) -> List[Dict[str, Any]]: + """ + Reads and loads JSON data from a specified file path. + + :param json_file_path: The path of the JSON file. + + :returns: The content of the JSON file. + """ + with open(json_file_path, "r", encoding="utf-8") as f: + return json.load(f) + + def generate(self) -> str: + """ + Generates an HTML string that represents the parsed data. + + :returns: The generated HTML string. + """ + + html = f"\n\n@{self.username} archived tweets\n" + html += "\n" + html += "\n\n" + html += f"
@{self.username} archived tweets
\n" + html += '
\n' + + for tweet in self.json_content: + html += '
\n' + + # TODO: JSON Issue + # if ( + # ( + # tweet["archived_mimetype"] != "application/json" + # and not tweet["parsed_tweet_text_mimetype_json"] + # ) + # and not tweet["available_tweet_text"] + # ) or ( + # ( + # tweet["archived_mimetype"] == "application/json" + # and not tweet["parsed_tweet_text_mimetype_json"] + # ) + # and not tweet["available_tweet_text"] + # ): + if ( + tweet["archived_mimetype"] != "application/json" + and not tweet["available_tweet_text"] + ): + html += f'\n' + + html += f'
Original Tweet↗ · \n' + html += f'Parsed Tweet↗ · \n' + html += f'Archived Tweet↗ · \n' + html += f'Parsed Archived Tweet↗
\n' + + if tweet["available_tweet_text"]: + html += "
\n" + html += f'
Available Tweet Content: {tweet["available_tweet_text"]}
\n' + html += f'
Available Tweet Is Retweet: {tweet["available_tweet_is_RT"]}
\n' + html += f'
Available Tweet Username: {tweet["available_tweet_info"]}
\n' + + # TODO: JSON Issue + # if ( + # tweet["archived_mimetype"] == "application/json" + # and tweet["parsed_tweet_text_mimetype_json"] + # ) and not tweet["available_tweet_text"]: + # html += f'
Parsed Tweet Text (application/json): {tweet["parsed_tweet_text_mimetype_json"]}
\n' + + html += "
\n" + html += f'
Archived URL Key: {tweet["archived_urlkey"]}
\n' + html += f'
Archived Timestamp: {tweet["archived_timestamp"]}
\n' + html += f'
Archived mimetype: {tweet["archived_mimetype"]}
\n' + html += f'
Archived Statuscode: {tweet["archived_statuscode"]}
\n' + html += ( + f'
Archived Digest: {tweet["archived_digest"]}
\n' + ) + html += ( + f'
Archived Length: {tweet["archived_length"]}
\n' + ) + html += "
\n" + + html += "
\n" + html += '
generated by Wayback Tweets↗
\n' + html += "\n" + + return html + + def save(self, html_content: str) -> None: + """ + Saves the generated HTML string to a file. + + :param html_content: The HTML string to be saved. + """ + with open(self.html_file_path, "w", encoding="utf-8") as f: + f.write(html_content) diff --git a/waybacktweets/api/viz_tweets.py b/waybacktweets/api/viz_tweets.py deleted file mode 100644 index ef13aac..0000000 --- a/waybacktweets/api/viz_tweets.py +++ /dev/null @@ -1,126 +0,0 @@ -# flake8: noqa: E501 -""" -Generates an HTML file to visualize the parsed data. -""" - -import json -from typing import Any, Dict, List - - -class HTMLTweetsVisualizer: - """ - Class responsible for generating an HTML file to visualize the parsed data. - - :param json_content: The content of the JSON file. - :param html_file_path: The path where the HTML file will be saved. - :param username: The username associated with the tweets. - """ - - def __init__(self, json_file_path: str, html_file_path: str, username: str): - self.json_content = self._json_loader(json_file_path) - self.html_file_path = html_file_path - self.username = username - - @staticmethod - def _json_loader(json_file_path: str) -> List[Dict[str, Any]]: - """ - Reads and loads JSON data from a specified file path. - - :param json_file_path: The path of the JSON file. - - :returns: The content of the JSON file. - """ - with open(json_file_path, "r", encoding="utf-8") as f: - return json.load(f) - - def generate(self) -> str: - """ - Generates an HTML string that represents the parsed data. - - :returns: The generated HTML string. - """ - - html = f"\n\n@{self.username} archived tweets\n" - html += "\n" - html += "\n\n" - html += f"
@{self.username} archived tweets
\n" - html += '
\n' - - for tweet in self.json_content: - html += '
\n' - - # TODO: JSON Issue - # if ( - # ( - # tweet["archived_mimetype"] != "application/json" - # and not tweet["parsed_tweet_text_mimetype_json"] - # ) - # and not tweet["available_tweet_text"] - # ) or ( - # ( - # tweet["archived_mimetype"] == "application/json" - # and not tweet["parsed_tweet_text_mimetype_json"] - # ) - # and not tweet["available_tweet_text"] - # ): - if ( - tweet["archived_mimetype"] != "application/json" - and not tweet["available_tweet_text"] - ): - html += f'\n' - - html += f'
Original Tweet↗ · \n' - html += f'Parsed Tweet↗ · \n' - html += f'Archived Tweet↗ · \n' - html += f'Parsed Archived Tweet↗
\n' - - if tweet["available_tweet_text"]: - html += "
\n" - html += f'
Available Tweet Content: {tweet["available_tweet_text"]}
\n' - html += f'
Available Tweet Is Retweet: {tweet["available_tweet_is_RT"]}
\n' - html += f'
Available Tweet Username: {tweet["available_tweet_info"]}
\n' - - # TODO: JSON Issue - # if ( - # tweet["archived_mimetype"] == "application/json" - # and tweet["parsed_tweet_text_mimetype_json"] - # ) and not tweet["available_tweet_text"]: - # html += f'
Parsed Tweet Text (application/json): {tweet["parsed_tweet_text_mimetype_json"]}
\n' - - html += "
\n" - html += f'
Archived URL Key: {tweet["archived_urlkey"]}
\n' - html += f'
Archived Timestamp: {tweet["archived_timestamp"]}
\n' - html += f'
Archived mimetype: {tweet["archived_mimetype"]}
\n' - html += f'
Archived Statuscode: {tweet["archived_statuscode"]}
\n' - html += ( - f'
Archived Digest: {tweet["archived_digest"]}
\n' - ) - html += ( - f'
Archived Length: {tweet["archived_length"]}
\n' - ) - html += "
\n" - - html += "
\n" - html += '
generated by Wayback Tweets↗
\n' - html += "\n" - - return html - - def save(self, html_content: str) -> None: - """ - Saves the generated HTML string to a file. - - :param html_content: The HTML string to be saved. - """ - with open(self.html_file_path, "w", encoding="utf-8") as f: - f.write(html_content) diff --git a/waybacktweets/config/__pycache__/__init__.cpython-311.pyc b/waybacktweets/config/__pycache__/__init__.cpython-311.pyc deleted file mode 100644 index 9b453bc..0000000 Binary files a/waybacktweets/config/__pycache__/__init__.cpython-311.pyc and /dev/null differ diff --git a/waybacktweets/config/__pycache__/config.cpython-311.pyc b/waybacktweets/config/__pycache__/config.cpython-311.pyc deleted file mode 100644 index 2bf4595..0000000 Binary files a/waybacktweets/config/__pycache__/config.cpython-311.pyc and /dev/null differ diff --git a/waybacktweets/config/config.py b/waybacktweets/config/config.py index eb6e6dd..5d1ab67 100644 --- a/waybacktweets/config/config.py +++ b/waybacktweets/config/config.py @@ -10,7 +10,7 @@ class _Config: config = _Config() """ -Configuration settings.. +Configuration settings. .. attribute:: verbose diff --git a/waybacktweets/exceptions/__pycache__/__init__.cpython-311.pyc b/waybacktweets/exceptions/__pycache__/__init__.cpython-311.pyc deleted file mode 100644 index 8d74104..0000000 Binary files a/waybacktweets/exceptions/__pycache__/__init__.cpython-311.pyc and /dev/null differ diff --git a/waybacktweets/exceptions/__pycache__/exceptions.cpython-311.pyc b/waybacktweets/exceptions/__pycache__/exceptions.cpython-311.pyc deleted file mode 100644 index 5ade7da..0000000 Binary files a/waybacktweets/exceptions/__pycache__/exceptions.cpython-311.pyc and /dev/null differ

@{self.username} archived tweets

generated by Wayback Tweets↗

@{self.username} archived tweets

generated by Wayback Tweets↗

generated by Wayback Tweets↗

generated by Wayback Tweets↗