From: Claromes Date: Mon, 17 Jun 2024 01:57:54 +0000 (-0300) Subject: update file name X-Git-Url: https://git.claromes.com/?a=commitdiff_plain;h=90a8611c2f4ee85ae8647995e3095c0605a97c0e;p=waybacktweets.git update file name --- diff --git a/.gitignore b/.gitignore index 26166db..5df8b23 100644 --- a/.gitignore +++ b/.gitignore @@ -1,9 +1,12 @@ *.csv *.json *.html + waybacktweets/__pycache__ waybacktweets/api/__pycache__ -waybacktweets/cli/__pycache__ +waybacktweets/config/__pycache__ +waybacktweets/exceptions/__pycache__ waybacktweets/utils/__pycache__ + docs/_build/ notes.md diff --git a/app/app.py b/app/app.py index b1db36b..b0c6f71 100644 --- a/app/app.py +++ b/app/app.py @@ -3,9 +3,9 @@ import datetime import streamlit as st import streamlit.components.v1 as components -from waybacktweets.api.export_tweets import TweetsExporter -from waybacktweets.api.parse_tweets import JsonParser, TweetsParser -from waybacktweets.api.request_tweets import WaybackTweets +from waybacktweets.api.export import TweetsExporter +from waybacktweets.api.parse import JsonParser, TweetsParser +from waybacktweets.api.request import WaybackTweets from waybacktweets.config.config import config from waybacktweets.exceptions.exceptions import ( ConnectionError, diff --git a/docs/api.rst b/docs/api.rst index ee5ad43..7e7ab9f 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -32,7 +32,7 @@ Exceptions Export --------- -.. automodule:: waybacktweets.api.export_tweets +.. automodule:: waybacktweets.api.export .. autoclass:: TweetsExporter :members: @@ -41,7 +41,7 @@ Export Parse --------- -.. automodule:: waybacktweets.api.parse_tweets +.. automodule:: waybacktweets.api.parse .. autoclass:: TweetsParser :members: @@ -56,7 +56,7 @@ Parse Request --------- -.. automodule:: waybacktweets.api.request_tweets +.. automodule:: waybacktweets.api.request .. autoclass:: WaybackTweets :members: @@ -80,7 +80,7 @@ Utils Visualizer ----------- -.. automodule:: waybacktweets.api.viz_tweets +.. automodule:: waybacktweets.api.visualize .. autoclass:: HTMLTweetsVisualizer :members: diff --git a/docs/contribute.rst b/docs/contribute.rst index 84ed2cb..0191658 100644 --- a/docs/contribute.rst +++ b/docs/contribute.rst @@ -27,10 +27,11 @@ Brief explanation about the code under the Wayback Tweets directory: - ``assets``: Title and logo images - ``docs``: Documentation generated with Sphinx - ``waybacktweets/api``: Main package modules -- ``waybacktweets/cli``: Command line Interface module +- ``waybacktweets/config``: Global configuration module +- ``waybacktweets/exceptions``: Wayback Tweets Exceptions - ``waybacktweets/utils``: Helper functions used in the package Sponsoring ------------ +------------ You can also donate to the project's developer and maintainer, `Claromes `_, via `GitHub Sponsor `_ or if you are interested in sponsoring the project you can contact via email at support at claromes dot com. diff --git a/docs/todo.rst b/docs/todo.rst index cda3ea5..eaced03 100644 --- a/docs/todo.rst +++ b/docs/todo.rst @@ -5,9 +5,7 @@ TODO -|uncheck| Code: JSON Issue: Create a separate function to handle JSON return, apply JsonParser (``waybacktweets/api/parse_tweets.py:73``), and avoid rate limiting (`Planned for v1.1`) - -|uncheck| Docs: Add tutorial on how to save Tweet via command line (`Planned for v1.1`) +|uncheck| Code: JSON Parser: Create a separate function to handle JSON return, apply JsonParser (``waybacktweets/api/parse.py:111``), and avoid rate limiting (`Planned for v1.1`) |uncheck| Code: Download images when tweet URL has extensions like JPG or PNG (`Planned for v1.2`) diff --git a/waybacktweets/__init__.py b/waybacktweets/__init__.py index 75b5ae4..f8333d8 100644 --- a/waybacktweets/__init__.py +++ b/waybacktweets/__init__.py @@ -1,8 +1,8 @@ # flake8: noqa: F401 -from waybacktweets.api.export_tweets import TweetsExporter -from waybacktweets.api.parse_tweets import JsonParser, TweetsParser, TwitterEmbed -from waybacktweets.api.request_tweets import WaybackTweets -from waybacktweets.api.viz_tweets import HTMLTweetsVisualizer +from waybacktweets.api.export import TweetsExporter +from waybacktweets.api.parse import JsonParser, TweetsParser, TwitterEmbed +from waybacktweets.api.request import WaybackTweets +from waybacktweets.api.visualize import HTMLTweetsVisualizer __version__ = "1.0" diff --git a/waybacktweets/_cli.py b/waybacktweets/_cli.py index 74fb140..6039477 100644 --- a/waybacktweets/_cli.py +++ b/waybacktweets/_cli.py @@ -8,9 +8,9 @@ from typing import Any, Optional import click from rich import print as rprint -from waybacktweets.api.export_tweets import TweetsExporter -from waybacktweets.api.parse_tweets import TweetsParser -from waybacktweets.api.request_tweets import WaybackTweets +from waybacktweets.api.export import TweetsExporter +from waybacktweets.api.parse import TweetsParser +from waybacktweets.api.request import WaybackTweets from waybacktweets.config.config import config diff --git a/waybacktweets/api/export.py b/waybacktweets/api/export.py new file mode 100644 index 0000000..7751679 --- /dev/null +++ b/waybacktweets/api/export.py @@ -0,0 +1,117 @@ +""" +Exports the parsed archived tweets. +""" + +import datetime +import os +import re +from typing import Any, Dict, List, Optional + +import pandas as pd + +from waybacktweets.api.visualize import HTMLTweetsVisualizer + + +class TweetsExporter: + """ + Class responsible for exporting parsed archived tweets. + + :param data: The parsed archived tweets data. + :param username: The username associated with the tweets. + :param field_options: The fields to be included in the exported data. Options include "archived_urlkey", "archived_timestamp", "original_tweet_url", "archived_tweet_url", "parsed_tweet_url", "parsed_archived_tweet_url", "available_tweet_text", "available_tweet_is_RT", "available_tweet_info", "archived_mimetype", "archived_statuscode", "archived_digest", "archived_length". + """ # noqa: E501 + + def __init__( + self, data: Dict[str, List[Any]], username: str, field_options: List[str] + ): + self.data = data + self.username = username + self.field_options = field_options + self.formatted_datetime = self._datetime_now() + self.filename = f"{self.username}_tweets_{self.formatted_datetime}" + self.dataframe = self._create_dataframe() + + @staticmethod + def _datetime_now() -> str: + """ + Returns the current datetime, formatted as a string. + + :returns: The current datetime. + """ + now = datetime.datetime.now() + formatted_now = now.strftime("%Y%m%d%H%M%S") + formatted_now = re.sub(r"\W+", "", formatted_now) + + return formatted_now + + @staticmethod + def _transpose_matrix( + data: Dict[str, List[Any]], fill_value: Optional[Any] = None + ) -> List[List[Any]]: + """ + Transposes a matrix, + filling in missing values with a specified fill value if needed. + + :param data: The matrix to be transposed. + :param fill_value: The value to fill in missing values with. + + :returns: The transposed matrix. + """ + max_length = max(len(sublist) for sublist in data.values()) + + filled_data = { + key: value + [fill_value] * (max_length - len(value)) + for key, value in data.items() + } + + data_transposed = [list(row) for row in zip(*filled_data.values())] + + return data_transposed + + def _create_dataframe(self) -> pd.DataFrame: + """ + Creates a DataFrame from the transposed data. + + :returns: The DataFrame representation of the data. + """ + data_transposed = self._transpose_matrix(self.data) + + df = pd.DataFrame(data_transposed, columns=self.field_options) + + return df + + def save_to_csv(self) -> None: + """ + Saves the DataFrame to a CSV file. + """ + csv_file_path = f"{self.filename}.csv" + self.dataframe.to_csv(csv_file_path, index=False) + + print(f"Saved to {csv_file_path}") + + def save_to_json(self) -> None: + """ + Saves the DataFrame to a JSON file. + """ + json_file_path = f"{self.filename}.json" + self.dataframe.to_json(json_file_path, orient="records", lines=False) + + print(f"Saved to {json_file_path}") + + def save_to_html(self) -> None: + """ + Saves the DataFrame to an HTML file. + """ + json_file_path = f"{self.filename}.json" + + if not os.path.exists(json_file_path): + self.save_to_json() + + html_file_path = f"{self.filename}.html" + + html = HTMLTweetsVisualizer(json_file_path, html_file_path, self.username) + + html_content = html.generate() + html.save(html_content) + + print(f"Saved to {html_file_path}") diff --git a/waybacktweets/api/export_tweets.py b/waybacktweets/api/export_tweets.py deleted file mode 100644 index a6daf41..0000000 --- a/waybacktweets/api/export_tweets.py +++ /dev/null @@ -1,117 +0,0 @@ -""" -Exports the parsed archived tweets. -""" - -import datetime -import os -import re -from typing import Any, Dict, List, Optional - -import pandas as pd - -from waybacktweets.api.viz_tweets import HTMLTweetsVisualizer - - -class TweetsExporter: - """ - Class responsible for exporting parsed archived tweets. - - :param data: The parsed archived tweets data. - :param username: The username associated with the tweets. - :param field_options: The fields to be included in the exported data. Options include "archived_urlkey", "archived_timestamp", "original_tweet_url", "archived_tweet_url", "parsed_tweet_url", "parsed_archived_tweet_url", "available_tweet_text", "available_tweet_is_RT", "available_tweet_info", "archived_mimetype", "archived_statuscode", "archived_digest", "archived_length". - """ # noqa: E501 - - def __init__( - self, data: Dict[str, List[Any]], username: str, field_options: List[str] - ): - self.data = data - self.username = username - self.field_options = field_options - self.formatted_datetime = self._datetime_now() - self.filename = f"{self.username}_tweets_{self.formatted_datetime}" - self.dataframe = self._create_dataframe() - - @staticmethod - def _datetime_now() -> str: - """ - Returns the current datetime, formatted as a string. - - :returns: The current datetime. - """ - now = datetime.datetime.now() - formatted_now = now.strftime("%Y%m%d%H%M%S") - formatted_now = re.sub(r"\W+", "", formatted_now) - - return formatted_now - - @staticmethod - def _transpose_matrix( - data: Dict[str, List[Any]], fill_value: Optional[Any] = None - ) -> List[List[Any]]: - """ - Transposes a matrix, - filling in missing values with a specified fill value if needed. - - :param data: The matrix to be transposed. - :param fill_value: The value to fill in missing values with. - - :returns: The transposed matrix. - """ - max_length = max(len(sublist) for sublist in data.values()) - - filled_data = { - key: value + [fill_value] * (max_length - len(value)) - for key, value in data.items() - } - - data_transposed = [list(row) for row in zip(*filled_data.values())] - - return data_transposed - - def _create_dataframe(self) -> pd.DataFrame: - """ - Creates a DataFrame from the transposed data. - - :returns: The DataFrame representation of the data. - """ - data_transposed = self._transpose_matrix(self.data) - - df = pd.DataFrame(data_transposed, columns=self.field_options) - - return df - - def save_to_csv(self) -> None: - """ - Saves the DataFrame to a CSV file. - """ - csv_file_path = f"{self.filename}.csv" - self.dataframe.to_csv(csv_file_path, index=False) - - print(f"Saved to {csv_file_path}") - - def save_to_json(self) -> None: - """ - Saves the DataFrame to a JSON file. - """ - json_file_path = f"{self.filename}.json" - self.dataframe.to_json(json_file_path, orient="records", lines=False) - - print(f"Saved to {json_file_path}") - - def save_to_html(self) -> None: - """ - Saves the DataFrame to an HTML file. - """ - json_file_path = f"{self.filename}.json" - - if not os.path.exists(json_file_path): - self.save_to_json() - - html_file_path = f"{self.filename}.html" - - html = HTMLTweetsVisualizer(json_file_path, html_file_path, self.username) - - html_content = html.generate() - html.save(html_content) - - print(f"Saved to {html_file_path}") diff --git a/waybacktweets/api/parse.py b/waybacktweets/api/parse.py new file mode 100644 index 0000000..519696c --- /dev/null +++ b/waybacktweets/api/parse.py @@ -0,0 +1,281 @@ +""" +Parses the returned data from the Wayback CDX Server API. +""" + +import re +from concurrent.futures import ThreadPoolExecutor, as_completed +from contextlib import nullcontext +from typing import Any, Dict, List, Optional, Tuple +from urllib.parse import unquote + +from rich import print as rprint +from rich.progress import Progress + +from waybacktweets.config.config import config +from waybacktweets.exceptions.exceptions import ( + ConnectionError, + GetResponseError, + HTTPError, +) +from waybacktweets.utils.utils import ( + check_double_status, + check_pattern_tweet, + clean_tweet_url, + delete_tweet_pathnames, + get_response, + is_tweet_url, + semicolon_parser, +) + + +class TwitterEmbed: + """ + Class responsible for parsing tweets using the Twitter Publish service. + + :param tweet_url: The URL of the tweet to be parsed. + """ + + def __init__(self, tweet_url: str): + self.tweet_url = tweet_url + + def embed(self) -> Optional[Tuple[List[str], List[bool], List[str]]]: + """ + Parses the archived tweets when they are still available. + + This function goes through each archived tweet and checks + if it is still available. + If the tweet is available, it extracts the necessary information + and adds it to the respective lists. + The function returns a tuple of three lists: + - The first list contains the tweet texts. + - The second list contains boolean values indicating whether each tweet + is still available. + - The third list contains the URLs of the tweets. + + :returns: A tuple of three lists containing the tweet texts, + availability statuses, and URLs, respectively. If no tweets are available, + returns None. + """ + try: + url = f"https://publish.twitter.com/oembed?url={self.tweet_url}" + response = get_response(url=url) + if response: + json_response = response.json() + html = json_response["html"] + author_name = json_response["author_name"] + + regex = re.compile( + r'