*.csv
*.json
*.html
+
waybacktweets/__pycache__
waybacktweets/api/__pycache__
-waybacktweets/cli/__pycache__
+waybacktweets/config/__pycache__
+waybacktweets/exceptions/__pycache__
waybacktweets/utils/__pycache__
+
docs/_build/
notes.md
import streamlit as st
import streamlit.components.v1 as components
-from waybacktweets.api.export_tweets import TweetsExporter
-from waybacktweets.api.parse_tweets import JsonParser, TweetsParser
-from waybacktweets.api.request_tweets import WaybackTweets
+from waybacktweets.api.export import TweetsExporter
+from waybacktweets.api.parse import JsonParser, TweetsParser
+from waybacktweets.api.request import WaybackTweets
from waybacktweets.config.config import config
from waybacktweets.exceptions.exceptions import (
ConnectionError,
Export
---------
-.. automodule:: waybacktweets.api.export_tweets
+.. automodule:: waybacktweets.api.export
.. autoclass:: TweetsExporter
:members:
Parse
---------
-.. automodule:: waybacktweets.api.parse_tweets
+.. automodule:: waybacktweets.api.parse
.. autoclass:: TweetsParser
:members:
Request
---------
-.. automodule:: waybacktweets.api.request_tweets
+.. automodule:: waybacktweets.api.request
.. autoclass:: WaybackTweets
:members:
Visualizer
-----------
-.. automodule:: waybacktweets.api.viz_tweets
+.. automodule:: waybacktweets.api.visualize
.. autoclass:: HTMLTweetsVisualizer
:members:
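For orientation, a minimal end-to-end sketch using the renamed modules; the username and field list below are placeholder values, and the signatures follow the classes added in this changeset::

    from waybacktweets.api.export import TweetsExporter
    from waybacktweets.api.parse import TweetsParser
    from waybacktweets.api.request import WaybackTweets

    username = "jack"  # placeholder handle
    fields = [
        "archived_urlkey",
        "archived_timestamp",
        "original_tweet_url",
        "archived_tweet_url",
    ]

    api = WaybackTweets(username)
    response = api.get()  # CDX data (list of rows), or None on failure

    if response:
        parser = TweetsParser(response, username, fields)
        parsed_tweets = parser.parse(print_progress=True)

        exporter = TweetsExporter(parsed_tweets, username, fields)
        exporter.save_to_csv()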
- ``assets``: Title and logo images
- ``docs``: Documentation generated with Sphinx
- ``waybacktweets/api``: Main package modules
-- ``waybacktweets/cli``: Command line Interface module
+- ``waybacktweets/config``: Global configuration module
+- ``waybacktweets/exceptions``: Wayback Tweets Exceptions
- ``waybacktweets/utils``: Helper functions used in the package
Sponsoring
------------
+------------
You can also donate to the project's developer and maintainer, `Claromes <https://claromes.com>`_, via `GitHub Sponsors <https://github.com/sponsors/claromes>`_. If you are interested in sponsoring the project, contact the maintainer by email at support at claromes dot com.
<input type="checkbox">
-|uncheck| Code: JSON Issue: Create a separate function to handle JSON return, apply JsonParser (``waybacktweets/api/parse_tweets.py:73``), and avoid rate limiting (`Planned for v1.1`)
-
-|uncheck| Docs: Add tutorial on how to save Tweet via command line (`Planned for v1.1`)
+|uncheck| Code: JSON Parser: Create a separate function to handle JSON return, apply JsonParser (``waybacktweets/api/parse.py:111``), and avoid rate limiting (`Planned for v1.1`)
|uncheck| Code: Download images when tweet URL has extensions like JPG or PNG (`Planned for v1.2`)
# flake8: noqa: F401
-from waybacktweets.api.export_tweets import TweetsExporter
-from waybacktweets.api.parse_tweets import JsonParser, TweetsParser, TwitterEmbed
-from waybacktweets.api.request_tweets import WaybackTweets
-from waybacktweets.api.viz_tweets import HTMLTweetsVisualizer
+from waybacktweets.api.export import TweetsExporter
+from waybacktweets.api.parse import JsonParser, TweetsParser, TwitterEmbed
+from waybacktweets.api.request import WaybackTweets
+from waybacktweets.api.visualize import HTMLTweetsVisualizer
__version__ = "1.0"
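Assuming the hunk above is the package root ``waybacktweets/__init__.py``, the re-exports should also allow top-level imports, e.g.::

    from waybacktweets import TweetsExporter, TweetsParser, WaybackTweets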
import click
from rich import print as rprint
-from waybacktweets.api.export_tweets import TweetsExporter
-from waybacktweets.api.parse_tweets import TweetsParser
-from waybacktweets.api.request_tweets import WaybackTweets
+from waybacktweets.api.export import TweetsExporter
+from waybacktweets.api.parse import TweetsParser
+from waybacktweets.api.request import WaybackTweets
from waybacktweets.config.config import config
--- /dev/null
+"""
+Exports the parsed archived tweets.
+"""
+
+import datetime
+import os
+import re
+from typing import Any, Dict, List, Optional
+
+import pandas as pd
+
+from waybacktweets.api.visualize import HTMLTweetsVisualizer
+
+
+class TweetsExporter:
+ """
+ Class responsible for exporting parsed archived tweets.
+
+ :param data: The parsed archived tweets data.
+ :param username: The username associated with the tweets.
+ :param field_options: The fields to be included in the exported data. Options include "archived_urlkey", "archived_timestamp", "original_tweet_url", "archived_tweet_url", "parsed_tweet_url", "parsed_archived_tweet_url", "available_tweet_text", "available_tweet_is_RT", "available_tweet_info", "archived_mimetype", "archived_statuscode", "archived_digest", "archived_length".
+ """ # noqa: E501
+
+ def __init__(
+ self, data: Dict[str, List[Any]], username: str, field_options: List[str]
+ ):
+ self.data = data
+ self.username = username
+ self.field_options = field_options
+ self.formatted_datetime = self._datetime_now()
+ self.filename = f"{self.username}_tweets_{self.formatted_datetime}"
+ self.dataframe = self._create_dataframe()
+
+ @staticmethod
+ def _datetime_now() -> str:
+ """
+ Returns the current datetime, formatted as a string.
+
+ :returns: The current datetime.
+ """
+ now = datetime.datetime.now()
+ formatted_now = now.strftime("%Y%m%d%H%M%S")
+ formatted_now = re.sub(r"\W+", "", formatted_now)
+
+ return formatted_now
+
+ @staticmethod
+ def _transpose_matrix(
+ data: Dict[str, List[Any]], fill_value: Optional[Any] = None
+ ) -> List[List[Any]]:
+ """
+ Transposes a matrix,
+ filling in missing values with a specified fill value if needed.
+
+ :param data: The matrix to be transposed.
+ :param fill_value: The value to fill in missing values with.
+
+ :returns: The transposed matrix.
+ """
+        max_length = max(
+            (len(sublist) for sublist in data.values()), default=0
+        )
+
+ filled_data = {
+ key: value + [fill_value] * (max_length - len(value))
+ for key, value in data.items()
+ }
+
+ data_transposed = [list(row) for row in zip(*filled_data.values())]
+
+ return data_transposed
+
+ def _create_dataframe(self) -> pd.DataFrame:
+ """
+ Creates a DataFrame from the transposed data.
+
+ :returns: The DataFrame representation of the data.
+ """
+ data_transposed = self._transpose_matrix(self.data)
+
+ df = pd.DataFrame(data_transposed, columns=self.field_options)
+
+ return df
+
+ def save_to_csv(self) -> None:
+ """
+ Saves the DataFrame to a CSV file.
+ """
+ csv_file_path = f"{self.filename}.csv"
+ self.dataframe.to_csv(csv_file_path, index=False)
+
+ print(f"Saved to {csv_file_path}")
+
+ def save_to_json(self) -> None:
+ """
+ Saves the DataFrame to a JSON file.
+ """
+ json_file_path = f"{self.filename}.json"
+ self.dataframe.to_json(json_file_path, orient="records", lines=False)
+
+ print(f"Saved to {json_file_path}")
+
+ def save_to_html(self) -> None:
+ """
+ Saves the DataFrame to an HTML file.
+ """
+ json_file_path = f"{self.filename}.json"
+
+ if not os.path.exists(json_file_path):
+ self.save_to_json()
+
+ html_file_path = f"{self.filename}.html"
+
+ html = HTMLTweetsVisualizer(json_file_path, html_file_path, self.username)
+
+ html_content = html.generate()
+ html.save(html_content)
+
+ print(f"Saved to {html_file_path}")
+++ /dev/null
-"""
-Exports the parsed archived tweets.
-"""
-
-import datetime
-import os
-import re
-from typing import Any, Dict, List, Optional
-
-import pandas as pd
-
-from waybacktweets.api.viz_tweets import HTMLTweetsVisualizer
-
-
-class TweetsExporter:
- """
- Class responsible for exporting parsed archived tweets.
-
- :param data: The parsed archived tweets data.
- :param username: The username associated with the tweets.
- :param field_options: The fields to be included in the exported data. Options include "archived_urlkey", "archived_timestamp", "original_tweet_url", "archived_tweet_url", "parsed_tweet_url", "parsed_archived_tweet_url", "available_tweet_text", "available_tweet_is_RT", "available_tweet_info", "archived_mimetype", "archived_statuscode", "archived_digest", "archived_length".
- """ # noqa: E501
-
- def __init__(
- self, data: Dict[str, List[Any]], username: str, field_options: List[str]
- ):
- self.data = data
- self.username = username
- self.field_options = field_options
- self.formatted_datetime = self._datetime_now()
- self.filename = f"{self.username}_tweets_{self.formatted_datetime}"
- self.dataframe = self._create_dataframe()
-
- @staticmethod
- def _datetime_now() -> str:
- """
- Returns the current datetime, formatted as a string.
-
- :returns: The current datetime.
- """
- now = datetime.datetime.now()
- formatted_now = now.strftime("%Y%m%d%H%M%S")
- formatted_now = re.sub(r"\W+", "", formatted_now)
-
- return formatted_now
-
- @staticmethod
- def _transpose_matrix(
- data: Dict[str, List[Any]], fill_value: Optional[Any] = None
- ) -> List[List[Any]]:
- """
- Transposes a matrix,
- filling in missing values with a specified fill value if needed.
-
- :param data: The matrix to be transposed.
- :param fill_value: The value to fill in missing values with.
-
- :returns: The transposed matrix.
- """
- max_length = max(len(sublist) for sublist in data.values())
-
- filled_data = {
- key: value + [fill_value] * (max_length - len(value))
- for key, value in data.items()
- }
-
- data_transposed = [list(row) for row in zip(*filled_data.values())]
-
- return data_transposed
-
- def _create_dataframe(self) -> pd.DataFrame:
- """
- Creates a DataFrame from the transposed data.
-
- :returns: The DataFrame representation of the data.
- """
- data_transposed = self._transpose_matrix(self.data)
-
- df = pd.DataFrame(data_transposed, columns=self.field_options)
-
- return df
-
- def save_to_csv(self) -> None:
- """
- Saves the DataFrame to a CSV file.
- """
- csv_file_path = f"{self.filename}.csv"
- self.dataframe.to_csv(csv_file_path, index=False)
-
- print(f"Saved to {csv_file_path}")
-
- def save_to_json(self) -> None:
- """
- Saves the DataFrame to a JSON file.
- """
- json_file_path = f"{self.filename}.json"
- self.dataframe.to_json(json_file_path, orient="records", lines=False)
-
- print(f"Saved to {json_file_path}")
-
- def save_to_html(self) -> None:
- """
- Saves the DataFrame to an HTML file.
- """
- json_file_path = f"{self.filename}.json"
-
- if not os.path.exists(json_file_path):
- self.save_to_json()
-
- html_file_path = f"{self.filename}.html"
-
- html = HTMLTweetsVisualizer(json_file_path, html_file_path, self.username)
-
- html_content = html.generate()
- html.save(html_content)
-
- print(f"Saved to {html_file_path}")
--- /dev/null
+"""
+Parses the returned data from the Wayback CDX Server API.
+"""
+
+import re
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from contextlib import nullcontext
+from typing import Any, Dict, List, Optional, Tuple
+from urllib.parse import unquote
+
+from rich import print as rprint
+from rich.progress import Progress
+
+from waybacktweets.config.config import config
+from waybacktweets.exceptions.exceptions import (
+ ConnectionError,
+ GetResponseError,
+ HTTPError,
+)
+from waybacktweets.utils.utils import (
+ check_double_status,
+ check_pattern_tweet,
+ clean_tweet_url,
+ delete_tweet_pathnames,
+ get_response,
+ is_tweet_url,
+ semicolon_parser,
+)
+
+
+class TwitterEmbed:
+ """
+ Class responsible for parsing tweets using the Twitter Publish service.
+
+ :param tweet_url: The URL of the tweet to be parsed.
+ """
+
+ def __init__(self, tweet_url: str):
+ self.tweet_url = tweet_url
+
+ def embed(self) -> Optional[Tuple[List[str], List[bool], List[str]]]:
+ """
+ Parses the archived tweets when they are still available.
+
+ This function goes through each archived tweet and checks
+ if it is still available.
+ If the tweet is available, it extracts the necessary information
+ and adds it to the respective lists.
+ The function returns a tuple of three lists:
+ - The first list contains the tweet texts.
+        - The second list contains boolean values indicating whether each
+        tweet is a retweet.
+        - The third list contains the user information of each tweet.
+
+        :returns: A tuple of three lists containing the tweet texts,
+        retweet flags, and user information, respectively. If no tweets are
+        available, returns None.
+ """
+ try:
+ url = f"https://publish.twitter.com/oembed?url={self.tweet_url}"
+ response = get_response(url=url)
+ if response:
+ json_response = response.json()
+ html = json_response["html"]
+ author_name = json_response["author_name"]
+
+ regex = re.compile(
+ r'<blockquote class="twitter-tweet"(?: [^>]+)?><p[^>]*>(.*?)<\/p>.*?— (.*?)<\/a>', # noqa
+ re.DOTALL,
+ )
+ regex_author = re.compile(r"^(.*?)\s*\(")
+
+ matches_html = regex.findall(html)
+
+ tweet_content = []
+ user_info = []
+ is_RT = []
+
+ for match in matches_html:
+ tweet_content_match = re.sub(
+ r"<a[^>]*>|<\/a>", "", match[0].strip()
+ ).replace("<br>", "\n")
+ user_info_match = re.sub(
+ r"<a[^>]*>|<\/a>", "", match[1].strip()
+ ).replace(")", "), ")
+ match_author = regex_author.search(user_info_match)
+ author_tweet = match_author.group(1) if match_author else ""
+
+ if tweet_content_match:
+ tweet_content.append(tweet_content_match)
+ if user_info_match:
+ user_info.append(user_info_match)
+ is_RT.append(author_name != author_tweet)
+
+ return tweet_content, is_RT, user_info
+ except ConnectionError:
+ if config.verbose:
+ rprint("[yellow]Error parsing the tweet, but the CDX data was saved.")
+ except HTTPError:
+ if config.verbose:
+ rprint(
+ f"[yellow]{self.tweet_url} not available on the user's account, but the CDX data was saved." # noqa: E501
+ )
+ except GetResponseError as e:
+ if config.verbose:
+ rprint(f"[red]An error occurred: {str(e)}")
+
+ return None
+
+
+class JsonParser:
+ """
+ Class responsible for parsing tweets when the mimetype is application/json.\n
+ Note: This class is in an experimental phase, but it is currently being
+ used by the Streamlit Web App.
+
+ :param archived_tweet_url: The URL of the archived tweet to be parsed.
+ """
+
+ def __init__(self, archived_tweet_url: str):
+ self.archived_tweet_url = archived_tweet_url
+
+    def parse(self) -> Optional[str]:
+ """
+ Parses the archived tweets in JSON format.
+
+ :returns: The parsed tweet text.
+ """
+ try:
+ response = get_response(url=self.archived_tweet_url)
+
+ if response:
+ json_data = response.json()
+
+ if "data" in json_data:
+ return json_data["data"].get("text", json_data["data"])
+
+ if "retweeted_status" in json_data:
+ return json_data["retweeted_status"].get(
+ "text", json_data["retweeted_status"]
+ )
+
+ return json_data.get("text", json_data)
+ except ConnectionError:
+ if config.verbose:
+ rprint(
+ f"[yellow]Connection error with {self.archived_tweet_url}. Max retries exceeded. Error parsing the JSON, but the CDX data was saved." # noqa: E501
+ )
+ except GetResponseError as e:
+ if config.verbose:
+ rprint(f"[red]An error occurred: {str(e)}")
+
+ return None
+
+
+class TweetsParser:
+ """
+ Class responsible for the overall parsing of archived tweets.
+
+ :param archived_tweets_response: The response from the archived tweets.
+ :param username: The username associated with the tweets.
+ :param field_options: The fields to be included in the parsed data. Options include "archived_urlkey", "archived_timestamp", "original_tweet_url", "archived_tweet_url", "parsed_tweet_url", "parsed_archived_tweet_url", "available_tweet_text", "available_tweet_is_RT", "available_tweet_info", "archived_mimetype", "archived_statuscode", "archived_digest", "archived_length".
+ """ # noqa: E501
+
+ def __init__(
+ self,
+ archived_tweets_response: List[str],
+ username: str,
+ field_options: List[str],
+ ):
+ self.archived_tweets_response = archived_tweets_response
+ self.username = username
+ self.field_options = field_options
+ self.parsed_tweets = {option: [] for option in self.field_options}
+
+ def _add_field(self, key: str, value: Any) -> None:
+ """
+ Appends a value to a list in the parsed data structure.
+
+ :param key: The key in the parsed data structure.
+ :param value: The value to be appended.
+ """
+ if key in self.parsed_tweets:
+ self.parsed_tweets[key].append(value)
+
+ def _process_response(self, response: List[str]) -> None:
+ """
+ Processes the archived tweet's response and adds the relevant CDX data.
+
+ :param response: The response from the archived tweet.
+ """
+ tweet_remove_char = unquote(response[2]).replace("’", "")
+ cleaned_tweet = check_pattern_tweet(tweet_remove_char).strip('"')
+
+ wayback_machine_url = (
+ f"https://web.archive.org/web/{response[1]}/{tweet_remove_char}"
+ )
+ original_tweet = delete_tweet_pathnames(
+ clean_tweet_url(cleaned_tweet, self.username)
+ )
+ parsed_wayback_machine_url = (
+ f"https://web.archive.org/web/{response[1]}/{original_tweet}"
+ )
+
+ double_status = check_double_status(wayback_machine_url, original_tweet)
+
+ if double_status:
+ original_tweet = delete_tweet_pathnames(
+ f"https://twitter.com/{original_tweet}"
+ )
+ elif "://" not in original_tweet:
+ original_tweet = delete_tweet_pathnames(f"https://{original_tweet}")
+
+ encoded_tweet = semicolon_parser(response[2])
+ encoded_archived_tweet = semicolon_parser(wayback_machine_url)
+ encoded_parsed_tweet = semicolon_parser(original_tweet)
+ encoded_parsed_archived_tweet = semicolon_parser(parsed_wayback_machine_url)
+
+ available_tweet_text = None
+ available_tweet_is_RT = None
+ available_tweet_info = None
+
+ is_tweet = is_tweet_url(encoded_tweet)
+
+ if is_tweet:
+ embed_parser = TwitterEmbed(encoded_tweet)
+ content = embed_parser.embed()
+
+ if content:
+ available_tweet_text = semicolon_parser(content[0][0])
+ available_tweet_is_RT = content[1][0]
+ available_tweet_info = semicolon_parser(content[2][0])
+
+ self._add_field("available_tweet_text", available_tweet_text)
+ self._add_field("available_tweet_is_RT", available_tweet_is_RT)
+ self._add_field("available_tweet_info", available_tweet_info)
+
+ self._add_field("archived_urlkey", response[0])
+ self._add_field("archived_timestamp", response[1])
+ self._add_field("original_tweet_url", encoded_tweet)
+ self._add_field("archived_tweet_url", encoded_archived_tweet)
+ self._add_field("parsed_tweet_url", encoded_parsed_tweet)
+ self._add_field("parsed_archived_tweet_url", encoded_parsed_archived_tweet)
+ self._add_field("archived_mimetype", response[3])
+ self._add_field("archived_statuscode", response[4])
+ self._add_field("archived_digest", response[5])
+ self._add_field("archived_length", response[6])
+
+    def parse(self, print_progress: bool = False) -> Dict[str, List[Any]]:
+ """
+ Parses the archived tweets CDX data and structures it.
+
+ :param print_progress: A boolean indicating whether to print progress or not.
+
+ :returns: The parsed tweets data.
+ """
+ with ThreadPoolExecutor(max_workers=10) as executor:
+
+ futures = {
+ executor.submit(self._process_response, response): response
+ for response in self.archived_tweets_response[1:]
+ }
+
+ progress_context = Progress() if print_progress else nullcontext()
+ with progress_context as progress:
+ task = None
+ if print_progress:
+ task = progress.add_task(
+ f"Waybacking @{self.username} tweets\n", total=len(futures)
+ )
+
+ for future in as_completed(futures):
+ try:
+ future.result()
+ except Exception as e:
+ rprint(f"[red]{e}")
+
+ if print_progress:
+ progress.update(task, advance=1)
+
+ return self.parsed_tweets
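A hedged sketch of ``TwitterEmbed`` used standalone, checking whether a tweet is still live via the Twitter Publish (oEmbed) service; the status URL is a placeholder::

    embed_parser = TwitterEmbed("https://twitter.com/jack/status/20")  # placeholder URL
    content = embed_parser.embed()

    if content:
        texts, is_retweet_flags, user_infos = content
        print(texts[0], is_retweet_flags[0], user_infos[0])

``embed()`` returns ``None`` when the tweet is gone or the request fails, which is why ``TweetsParser._process_response`` guards on the result before reading it.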
+++ /dev/null
-"""
-Parses the returned data from the Wayback CDX Server API.
-"""
-
-import re
-from concurrent.futures import ThreadPoolExecutor, as_completed
-from contextlib import nullcontext
-from typing import Any, Dict, List, Optional, Tuple
-from urllib.parse import unquote
-
-from rich import print as rprint
-from rich.progress import Progress
-
-from waybacktweets.config.config import config
-from waybacktweets.exceptions.exceptions import (
- ConnectionError,
- GetResponseError,
- HTTPError,
-)
-from waybacktweets.utils.utils import (
- check_double_status,
- check_pattern_tweet,
- clean_tweet_url,
- delete_tweet_pathnames,
- get_response,
- is_tweet_url,
- semicolon_parser,
-)
-
-
-class TwitterEmbed:
- """
- Class responsible for parsing tweets using the Twitter Publish service.
-
- :param tweet_url: The URL of the tweet to be parsed.
- """
-
- def __init__(self, tweet_url: str):
- self.tweet_url = tweet_url
-
- def embed(self) -> Optional[Tuple[List[str], List[bool], List[str]]]:
- """
- Parses the archived tweets when they are still available.
-
- This function goes through each archived tweet and checks
- if it is still available.
- If the tweet is available, it extracts the necessary information
- and adds it to the respective lists.
- The function returns a tuple of three lists:
- - The first list contains the tweet texts.
- - The second list contains boolean values indicating whether each tweet
- is still available.
- - The third list contains the URLs of the tweets.
-
- :returns: A tuple of three lists containing the tweet texts,
- availability statuses, and URLs, respectively. If no tweets are available,
- returns None.
- """
- try:
- url = f"https://publish.twitter.com/oembed?url={self.tweet_url}"
- response = get_response(url=url)
- if response:
- json_response = response.json()
- html = json_response["html"]
- author_name = json_response["author_name"]
-
- regex = re.compile(
- r'<blockquote class="twitter-tweet"(?: [^>]+)?><p[^>]*>(.*?)<\/p>.*?— (.*?)<\/a>', # noqa
- re.DOTALL,
- )
- regex_author = re.compile(r"^(.*?)\s*\(")
-
- matches_html = regex.findall(html)
-
- tweet_content = []
- user_info = []
- is_RT = []
-
- for match in matches_html:
- tweet_content_match = re.sub(
- r"<a[^>]*>|<\/a>", "", match[0].strip()
- ).replace("<br>", "\n")
- user_info_match = re.sub(
- r"<a[^>]*>|<\/a>", "", match[1].strip()
- ).replace(")", "), ")
- match_author = regex_author.search(user_info_match)
- author_tweet = match_author.group(1) if match_author else ""
-
- if tweet_content_match:
- tweet_content.append(tweet_content_match)
- if user_info_match:
- user_info.append(user_info_match)
- is_RT.append(author_name != author_tweet)
-
- return tweet_content, is_RT, user_info
- except ConnectionError:
- if config.verbose:
- rprint("[yellow]Error parsing the tweet, but the CDX data was saved.")
- except HTTPError:
- if config.verbose:
- rprint(
- f"[yellow]{self.tweet_url} not available on the user's account, but the CDX data was saved." # noqa: E501
- )
- except GetResponseError as e:
- if config.verbose:
- rprint(f"[red]An error occurred: {str(e)}")
-
- return None
-
-
-# TODO: JSON Issue - Create separate function to handle JSON return without hitting rate limiting # noqa: E501
-class JsonParser:
- """
- Class responsible for parsing tweets when the mimetype is application/json.\n
- Note: This class is in an experimental phase, but it is currently being
- used by the Streamlit Web App.
-
- :param archived_tweet_url: The URL of the archived tweet to be parsed.
- """
-
- def __init__(self, archived_tweet_url: str):
- self.archived_tweet_url = archived_tweet_url
-
- def parse(self) -> str:
- """
- Parses the archived tweets in JSON format.
-
- :returns: The parsed tweet text.
- """
- try:
- response = get_response(url=self.archived_tweet_url)
-
- if response:
- json_data = response.json()
-
- if "data" in json_data:
- return json_data["data"].get("text", json_data["data"])
-
- if "retweeted_status" in json_data:
- return json_data["retweeted_status"].get(
- "text", json_data["retweeted_status"]
- )
-
- return json_data.get("text", json_data)
- except ConnectionError:
- if config.verbose:
- rprint(
- f"[yellow]Connection error with {self.archived_tweet_url}. Max retries exceeded. Error parsing the JSON, but the CDX data was saved." # noqa: E501
- )
- except GetResponseError as e:
- if config.verbose:
- rprint(f"[red]An error occurred: {str(e)}")
-
- return None
-
-
-class TweetsParser:
- """
- Class responsible for the overall parsing of archived tweets.
-
- :param archived_tweets_response: The response from the archived tweets.
- :param username: The username associated with the tweets.
- :param field_options: The fields to be included in the parsed data. Options include "archived_urlkey", "archived_timestamp", "original_tweet_url", "archived_tweet_url", "parsed_tweet_url", "parsed_archived_tweet_url", "available_tweet_text", "available_tweet_is_RT", "available_tweet_info", "archived_mimetype", "archived_statuscode", "archived_digest", "archived_length".
- """ # noqa: E501
-
- def __init__(
- self,
- archived_tweets_response: List[str],
- username: str,
- field_options: List[str],
- ):
- self.archived_tweets_response = archived_tweets_response
- self.username = username
- self.field_options = field_options
- self.parsed_tweets = {option: [] for option in self.field_options}
-
- def _add_field(self, key: str, value: Any) -> None:
- """
- Appends a value to a list in the parsed data structure.
-
- :param key: The key in the parsed data structure.
- :param value: The value to be appended.
- """
- if key in self.parsed_tweets:
- self.parsed_tweets[key].append(value)
-
- def _process_response(self, response: List[str]) -> None:
- """
- Processes the archived tweet's response and adds the relevant CDX data.
-
- :param response: The response from the archived tweet.
- """
- tweet_remove_char = unquote(response[2]).replace("’", "")
- cleaned_tweet = check_pattern_tweet(tweet_remove_char).strip('"')
-
- wayback_machine_url = (
- f"https://web.archive.org/web/{response[1]}/{tweet_remove_char}"
- )
- original_tweet = delete_tweet_pathnames(
- clean_tweet_url(cleaned_tweet, self.username)
- )
- parsed_wayback_machine_url = (
- f"https://web.archive.org/web/{response[1]}/{original_tweet}"
- )
-
- double_status = check_double_status(wayback_machine_url, original_tweet)
-
- if double_status:
- original_tweet = delete_tweet_pathnames(
- f"https://twitter.com/{original_tweet}"
- )
- elif "://" not in original_tweet:
- original_tweet = delete_tweet_pathnames(f"https://{original_tweet}")
-
- encoded_tweet = semicolon_parser(response[2])
- encoded_archived_tweet = semicolon_parser(wayback_machine_url)
- encoded_parsed_tweet = semicolon_parser(original_tweet)
- encoded_parsed_archived_tweet = semicolon_parser(parsed_wayback_machine_url)
-
- available_tweet_text = None
- available_tweet_is_RT = None
- available_tweet_info = None
-
- is_tweet = is_tweet_url(encoded_tweet)
-
- if is_tweet:
- embed_parser = TwitterEmbed(encoded_tweet)
- content = embed_parser.embed()
-
- if content:
- available_tweet_text = semicolon_parser(content[0][0])
- available_tweet_is_RT = content[1][0]
- available_tweet_info = semicolon_parser(content[2][0])
-
- self._add_field("available_tweet_text", available_tweet_text)
- self._add_field("available_tweet_is_RT", available_tweet_is_RT)
- self._add_field("available_tweet_info", available_tweet_info)
-
- # TODO: JSON Issue
- # parsed_text_json = ""
-
- # if response[3] == "application/json":
- # json_parser = JsonParser(encoded_parsed_archived_tweet)
- # text_json = json_parser.parse()
-
- # if text_json:
- # parsed_text_json = semicolon_parser(text_json)
-
- # self._add_field("parsed_tweet_text_mimetype_json", parsed_text_json)
-
- self._add_field("archived_urlkey", response[0])
- self._add_field("archived_timestamp", response[1])
- self._add_field("original_tweet_url", encoded_tweet)
- self._add_field("archived_tweet_url", encoded_archived_tweet)
- self._add_field("parsed_tweet_url", encoded_parsed_tweet)
- self._add_field("parsed_archived_tweet_url", encoded_parsed_archived_tweet)
- self._add_field("archived_mimetype", response[3])
- self._add_field("archived_statuscode", response[4])
- self._add_field("archived_digest", response[5])
- self._add_field("archived_length", response[6])
-
- def parse(self, print_progress=False) -> Dict[str, List[Any]]:
- """
- Parses the archived tweets CDX data and structures it.
-
- :param print_progress: A boolean indicating whether to print progress or not.
-
- :returns: The parsed tweets data.
- """
- with ThreadPoolExecutor(max_workers=10) as executor:
-
- futures = {
- executor.submit(self._process_response, response): response
- for response in self.archived_tweets_response[1:]
- }
-
- progress_context = Progress() if print_progress else nullcontext()
- with progress_context as progress:
- task = None
- if print_progress:
- task = progress.add_task(
- f"Waybacking @{self.username} tweets\n", total=len(futures)
- )
-
- for future in as_completed(futures):
- try:
- future.result()
- except Exception as e:
- rprint(f"[red]{e}")
-
- if print_progress:
- progress.update(task, advance=1)
-
- return self.parsed_tweets
--- /dev/null
+"""
+Requests data from the Wayback Machine API.
+"""
+
+from typing import Any, Dict, Optional
+
+from rich import print as rprint
+
+from waybacktweets.config.config import config
+from waybacktweets.exceptions.exceptions import (
+ ConnectionError,
+ EmptyResponseError,
+ GetResponseError,
+ HTTPError,
+ ReadTimeoutError,
+)
+from waybacktweets.utils.utils import get_response
+
+
+class WaybackTweets:
+ """
+ Class responsible for requesting data from the Wayback CDX Server API.
+
+ :param username: The username associated with the tweets.
+ :param collapse: The field to collapse duplicate lines on.
+ :param timestamp_from: The timestamp to start retrieving tweets from.
+ :param timestamp_to: The timestamp to stop retrieving tweets at.
+ :param limit: The maximum number of results to return.
+ :param offset: The number of lines to skip in the results.
+    :param matchtype: The scope of the URL match: a certain prefix, a certain host, or all subdomains.
+ """ # noqa: E501
+
+ def __init__(
+ self,
+ username: str,
+ collapse: str = None,
+ timestamp_from: str = None,
+ timestamp_to: str = None,
+ limit: int = None,
+ offset: int = None,
+ matchtype: str = None,
+ ):
+ self.username = username
+ self.collapse = collapse
+ self.timestamp_from = timestamp_from
+ self.timestamp_to = timestamp_to
+ self.limit = limit
+ self.offset = offset
+ self.matchtype = matchtype
+
+ def get(self) -> Optional[Dict[str, Any]]:
+ """
+ Sends a GET request to the Internet Archive's CDX API
+ to retrieve archived tweets.
+
+ :returns: The response from the CDX API in JSON format, if successful.
+ """
+ url = "https://web.archive.org/cdx/search/cdx"
+
+ status_pathname = "status/*"
+ if self.matchtype:
+ status_pathname = ""
+
+ params = {
+ "url": f"https://twitter.com/{self.username}/{status_pathname}",
+ "output": "json",
+ }
+
+ if self.collapse:
+ params["collapse"] = self.collapse
+
+ if self.timestamp_from:
+ params["from"] = self.timestamp_from
+
+ if self.timestamp_to:
+ params["to"] = self.timestamp_to
+
+ if self.limit:
+ params["limit"] = self.limit
+
+ if self.offset:
+ params["offset"] = self.offset
+
+ if self.matchtype:
+ params["matchType"] = self.matchtype
+
+ try:
+            response = get_response(url=url, params=params)
+
+            if response:
+                return response.json()
+ except ReadTimeoutError:
+ if config.verbose:
+ rprint("[red]Connection to web.archive.org timed out.")
+ except ConnectionError:
+ if config.verbose:
+ rprint(
+ "[red]Failed to establish a new connection with web.archive.org. Max retries exceeded. Please wait a few minutes and try again." # noqa: E501
+ )
+ except HTTPError as e:
+ if config.verbose:
+ rprint(f"[red]HTTP error occurred: {str(e)}")
+ except EmptyResponseError:
+ if config.verbose:
+ rprint("[red]No data was saved due to an empty response.")
+ except GetResponseError as e:
+ if config.verbose:
+ rprint(f"[red]An error occurred: {str(e)}")
+
+ return None
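A usage sketch with placeholder arguments. With ``output=json`` the CDX server returns a list of lists whose first row is a header row, which is why ``TweetsParser`` iterates over ``archived_tweets_response[1:]``::

    api = WaybackTweets("jack", timestamp_from="20150101", limit=100)  # placeholder values
    rows = api.get()

    if rows:
        header, records = rows[0], rows[1:]
        # For the default field set, header is expected to be
        # ["urlkey", "timestamp", "original", "mimetype",
        #  "statuscode", "digest", "length"]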
+++ /dev/null
-"""
-Requests data from the Wayback Machine API.
-"""
-
-from typing import Any, Dict, Optional
-
-from rich import print as rprint
-
-from waybacktweets.config.config import config
-from waybacktweets.exceptions.exceptions import (
- ConnectionError,
- EmptyResponseError,
- GetResponseError,
- HTTPError,
- ReadTimeoutError,
-)
-from waybacktweets.utils.utils import get_response
-
-
-class WaybackTweets:
- """
- Class responsible for requesting data from the Wayback CDX Server API.
-
- :param username: The username associated with the tweets.
- :param collapse: The field to collapse duplicate lines on.
- :param timestamp_from: The timestamp to start retrieving tweets from.
- :param timestamp_to: The timestamp to stop retrieving tweets at.
- :param limit: The maximum number of results to return.
- :param offset: The number of lines to skip in the results.
- :param matchType: Results matching a certain prefix, a certain host or all subdomains.
- """ # noqa: E501
-
- def __init__(
- self,
- username: str,
- collapse: str = None,
- timestamp_from: str = None,
- timestamp_to: str = None,
- limit: int = None,
- offset: int = None,
- matchtype: str = None,
- ):
- self.username = username
- self.collapse = collapse
- self.timestamp_from = timestamp_from
- self.timestamp_to = timestamp_to
- self.limit = limit
- self.offset = offset
- self.matchtype = matchtype
-
- def get(self) -> Optional[Dict[str, Any]]:
- """
- Sends a GET request to the Internet Archive's CDX API
- to retrieve archived tweets.
-
- :returns: The response from the CDX API in JSON format, if successful.
- """
- url = "https://web.archive.org/cdx/search/cdx"
-
- status_pathname = "status/*"
- if self.matchtype:
- status_pathname = ""
-
- params = {
- "url": f"https://twitter.com/{self.username}/{status_pathname}",
- "output": "json",
- }
-
- if self.collapse:
- params["collapse"] = self.collapse
-
- if self.timestamp_from:
- params["from"] = self.timestamp_from
-
- if self.timestamp_to:
- params["to"] = self.timestamp_to
-
- if self.limit:
- params["limit"] = self.limit
-
- if self.offset:
- params["offset"] = self.offset
-
- if self.matchtype:
- params["matchType"] = self.matchtype
-
- try:
- response = get_response(url=url, params=params)
- return response.json()
- except ReadTimeoutError:
- if config.verbose:
- rprint("[red]Connection to web.archive.org timed out.")
- except ConnectionError:
- if config.verbose:
- rprint(
- "[red]Failed to establish a new connection with web.archive.org. Max retries exceeded. Please wait a few minutes and try again." # noqa: E501
- )
- except HTTPError as e:
- if config.verbose:
- rprint(f"[red]HTTP error occurred: {str(e)}")
- except EmptyResponseError:
- if config.verbose:
- rprint("[red]No data was saved due to an empty response.")
- except GetResponseError as e:
- if config.verbose:
- rprint(f"[red]An error occurred: {str(e)}")
-
- return None
--- /dev/null
+# flake8: noqa: E501
+"""
+Generates an HTML file to visualize the parsed data.
+"""
+
+import json
+from typing import Any, Dict, List
+
+
+class HTMLTweetsVisualizer:
+ """
+ Class responsible for generating an HTML file to visualize the parsed data.
+
+    :param json_file_path: The path of the JSON file.
+ :param html_file_path: The path where the HTML file will be saved.
+ :param username: The username associated with the tweets.
+ """
+
+ def __init__(self, json_file_path: str, html_file_path: str, username: str):
+ self.json_content = self._json_loader(json_file_path)
+ self.html_file_path = html_file_path
+ self.username = username
+
+ @staticmethod
+ def _json_loader(json_file_path: str) -> List[Dict[str, Any]]:
+ """
+ Reads and loads JSON data from a specified file path.
+
+ :param json_file_path: The path of the JSON file.
+
+ :returns: The content of the JSON file.
+ """
+ with open(json_file_path, "r", encoding="utf-8") as f:
+ return json.load(f)
+
+ def generate(self) -> str:
+ """
+ Generates an HTML string that represents the parsed data.
+
+ :returns: The generated HTML string.
+ """
+
+ html = f"<html>\n<head>\n<title>@{self.username} archived tweets</title>\n"
+ html += "<style>\n"
+ html += "body { font-family: monospace; background-color: #f5f8fa; color: #1c1e21; margin: 0; padding: 20px; }\n"
+ html += ".container { display: flex; flex-wrap: wrap; gap: 20px; }\n"
+ html += ".tweet { flex: 0 1 calc(33.33% - 20px); background-color: #fff; border: 1px solid #e1e8ed; border-radius: 10px; padding: 15px; overflow-wrap: break-word; margin: auto; }\n"
+ html += ".tweet strong { font-weight: bold; }\n"
+ html += ".tweet a { color: #ef5552; text-decoration: none; }\n"
+ html += ".content { color: #ef5552; }\n"
+ html += ".tweet a:hover { text-decoration: underline; }\n"
+ html += "h1, h3 { text-align: center; }\n"
+ html += "iframe { width: 600px; height: 600px; }\n"
+ html += "</style>\n"
+ html += "</head>\n<body>\n"
+ html += f"<h1>@{self.username} archived tweets</h1>\n"
+ html += '<div class="container">\n'
+
+ for tweet in self.json_content:
+ html += '<div class="tweet">\n'
+
+ # TODO: JSON Issue
+ # if (
+ # (
+ # tweet["archived_mimetype"] != "application/json"
+ # and not tweet["parsed_tweet_text_mimetype_json"]
+ # )
+ # and not tweet["available_tweet_text"]
+ # ) or (
+ # (
+ # tweet["archived_mimetype"] == "application/json"
+ # and not tweet["parsed_tweet_text_mimetype_json"]
+ # )
+ # and not tweet["available_tweet_text"]
+ # ):
+ if (
+ tweet["archived_mimetype"] != "application/json"
+ and not tweet["available_tweet_text"]
+ ):
+ html += f'<iframe src="{tweet["parsed_archived_tweet_url"]}" frameborder="0" scrolling="auto"></iframe>\n'
+
+ html += f'<p><a href="{tweet["original_tweet_url"]}" target="_blank"><strong>Original Tweet↗</strong></a> · \n'
+ html += f'<a href="{tweet["parsed_tweet_url"]}" target="_blank"><strong>Parsed Tweet↗</strong></a> · \n'
+ html += f'<a href="{tweet["archived_tweet_url"]}" target="_blank"><strong>Archived Tweet↗</strong></a> · \n'
+ html += f'<a href="{tweet["parsed_archived_tweet_url"]}" target="_blank"><strong>Parsed Archived Tweet↗</strong></a></p>\n'
+
+ if tweet["available_tweet_text"]:
+ html += "<br>\n"
+ html += f'<p><strong class="content">Available Tweet Content:</strong> {tweet["available_tweet_text"]}</p>\n'
+ html += f'<p><strong class="content">Available Tweet Is Retweet:</strong> {tweet["available_tweet_is_RT"]}</p>\n'
+ html += f'<p><strong class="content">Available Tweet Username:</strong> {tweet["available_tweet_info"]}</p>\n'
+
+ # TODO: JSON Issue
+ # if (
+ # tweet["archived_mimetype"] == "application/json"
+ # and tweet["parsed_tweet_text_mimetype_json"]
+ # ) and not tweet["available_tweet_text"]:
+ # html += f'<p><strong class="content">Parsed Tweet Text (application/json):</strong> {tweet["parsed_tweet_text_mimetype_json"]}</p>\n'
+
+ html += "<br>\n"
+ html += f'<p><strong>Archived URL Key:</strong> {tweet["archived_urlkey"]}</p>\n'
+ html += f'<p><strong>Archived Timestamp:</strong> {tweet["archived_timestamp"]}</p>\n'
+            html += f'<p><strong>Archived Mimetype:</strong> {tweet["archived_mimetype"]}</p>\n'
+ html += f'<p><strong>Archived Statuscode:</strong> {tweet["archived_statuscode"]}</p>\n'
+ html += (
+ f'<p><strong>Archived Digest:</strong> {tweet["archived_digest"]}</p>\n'
+ )
+ html += (
+ f'<p><strong>Archived Length:</strong> {tweet["archived_length"]}</p>\n'
+ )
+ html += "</div>\n"
+
+ html += "</div>\n"
+ html += '<h3>generated by <a href="https://github.com/claromes/waybacktweets" target="_blank">Wayback Tweets↗</a></h3>\n'
+ html += "</body>\n</html>"
+
+ return html
+
+ def save(self, html_content: str) -> None:
+ """
+ Saves the generated HTML string to a file.
+
+ :param html_content: The HTML string to be saved.
+ """
+ with open(self.html_file_path, "w", encoding="utf-8") as f:
+ f.write(html_content)
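Used standalone, the visualizer mirrors what ``TweetsExporter.save_to_html()`` does above; a minimal sketch with placeholder file names::

    viz = HTMLTweetsVisualizer("jack_tweets_20240101000000.json", "jack_tweets.html", "jack")
    viz.save(viz.generate())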
+++ /dev/null
-# flake8: noqa: E501
-"""
-Generates an HTML file to visualize the parsed data.
-"""
-
-import json
-from typing import Any, Dict, List
-
-
-class HTMLTweetsVisualizer:
- """
- Class responsible for generating an HTML file to visualize the parsed data.
-
- :param json_content: The content of the JSON file.
- :param html_file_path: The path where the HTML file will be saved.
- :param username: The username associated with the tweets.
- """
-
- def __init__(self, json_file_path: str, html_file_path: str, username: str):
- self.json_content = self._json_loader(json_file_path)
- self.html_file_path = html_file_path
- self.username = username
-
- @staticmethod
- def _json_loader(json_file_path: str) -> List[Dict[str, Any]]:
- """
- Reads and loads JSON data from a specified file path.
-
- :param json_file_path: The path of the JSON file.
-
- :returns: The content of the JSON file.
- """
- with open(json_file_path, "r", encoding="utf-8") as f:
- return json.load(f)
-
- def generate(self) -> str:
- """
- Generates an HTML string that represents the parsed data.
-
- :returns: The generated HTML string.
- """
-
- html = f"<html>\n<head>\n<title>@{self.username} archived tweets</title>\n"
- html += "<style>\n"
- html += "body { font-family: monospace; background-color: #f5f8fa; color: #1c1e21; margin: 0; padding: 20px; }\n"
- html += ".container { display: flex; flex-wrap: wrap; gap: 20px; }\n"
- html += ".tweet { flex: 0 1 calc(33.33% - 20px); background-color: #fff; border: 1px solid #e1e8ed; border-radius: 10px; padding: 15px; overflow-wrap: break-word; margin: auto; }\n"
- html += ".tweet strong { font-weight: bold; }\n"
- html += ".tweet a { color: #ef5552; text-decoration: none; }\n"
- html += ".content { color: #ef5552; }\n"
- html += ".tweet a:hover { text-decoration: underline; }\n"
- html += "h1, h3 { text-align: center; }\n"
- html += "iframe { width: 600px; height: 600px; }\n"
- html += "</style>\n"
- html += "</head>\n<body>\n"
- html += f"<h1>@{self.username} archived tweets</h1>\n"
- html += '<div class="container">\n'
-
- for tweet in self.json_content:
- html += '<div class="tweet">\n'
-
- # TODO: JSON Issue
- # if (
- # (
- # tweet["archived_mimetype"] != "application/json"
- # and not tweet["parsed_tweet_text_mimetype_json"]
- # )
- # and not tweet["available_tweet_text"]
- # ) or (
- # (
- # tweet["archived_mimetype"] == "application/json"
- # and not tweet["parsed_tweet_text_mimetype_json"]
- # )
- # and not tweet["available_tweet_text"]
- # ):
- if (
- tweet["archived_mimetype"] != "application/json"
- and not tweet["available_tweet_text"]
- ):
- html += f'<iframe src="{tweet["parsed_archived_tweet_url"]}" frameborder="0" scrolling="auto"></iframe>\n'
-
- html += f'<p><a href="{tweet["original_tweet_url"]}" target="_blank"><strong>Original Tweet↗</strong></a> · \n'
- html += f'<a href="{tweet["parsed_tweet_url"]}" target="_blank"><strong>Parsed Tweet↗</strong></a> · \n'
- html += f'<a href="{tweet["archived_tweet_url"]}" target="_blank"><strong>Archived Tweet↗</strong></a> · \n'
- html += f'<a href="{tweet["parsed_archived_tweet_url"]}" target="_blank"><strong>Parsed Archived Tweet↗</strong></a></p>\n'
-
- if tweet["available_tweet_text"]:
- html += "<br>\n"
- html += f'<p><strong class="content">Available Tweet Content:</strong> {tweet["available_tweet_text"]}</p>\n'
- html += f'<p><strong class="content">Available Tweet Is Retweet:</strong> {tweet["available_tweet_is_RT"]}</p>\n'
- html += f'<p><strong class="content">Available Tweet Username:</strong> {tweet["available_tweet_info"]}</p>\n'
-
- # TODO: JSON Issue
- # if (
- # tweet["archived_mimetype"] == "application/json"
- # and tweet["parsed_tweet_text_mimetype_json"]
- # ) and not tweet["available_tweet_text"]:
- # html += f'<p><strong class="content">Parsed Tweet Text (application/json):</strong> {tweet["parsed_tweet_text_mimetype_json"]}</p>\n'
-
- html += "<br>\n"
- html += f'<p><strong>Archived URL Key:</strong> {tweet["archived_urlkey"]}</p>\n'
- html += f'<p><strong>Archived Timestamp:</strong> {tweet["archived_timestamp"]}</p>\n'
- html += f'<p><strong>Archived mimetype:</strong> {tweet["archived_mimetype"]}</p>\n'
- html += f'<p><strong>Archived Statuscode:</strong> {tweet["archived_statuscode"]}</p>\n'
- html += (
- f'<p><strong>Archived Digest:</strong> {tweet["archived_digest"]}</p>\n'
- )
- html += (
- f'<p><strong>Archived Length:</strong> {tweet["archived_length"]}</p>\n'
- )
- html += "</div>\n"
-
- html += "</div>\n"
- html += '<h3>generated by <a href="https://github.com/claromes/waybacktweets" target="_blank">Wayback Tweets↗</a></h3>\n'
- html += "</body>\n</html>"
-
- return html
-
- def save(self, html_content: str) -> None:
- """
- Saves the generated HTML string to a file.
-
- :param html_content: The HTML string to be saved.
- """
- with open(self.html_file_path, "w", encoding="utf-8") as f:
- f.write(html_content)
config = _Config()
"""
-Configuration settings..
+Configuration settings.
.. attribute:: verbose