From: Claromes
Date: Tue, 4 Jun 2024 16:59:17 +0000 (-0300)
Subject: add classes
X-Git-Url: https://git.claromes.com/?a=commitdiff_plain;h=d9f190c1d0dcb4278dd4969e8bb9b698b958da58;p=waybacktweets.git

add classes
---

diff --git a/waybacktweets/export_tweets.py b/waybacktweets/export_tweets.py
index 38c7702..1bd42a4 100644
--- a/waybacktweets/export_tweets.py
+++ b/waybacktweets/export_tweets.py
@@ -1,67 +1,78 @@
-"""
-Exports the parsed archived tweets.
-"""
-
 import pandas as pd
 import re
 import datetime
+import os
+from viz_tweets import HTMLTweetsVisualizer
+
+
+class TweetsExporter:
+    """Handles the exporting of parsed archived tweets."""
+
+    def __init__(self, data, username, metadata_options):
+        self.data = data
+        self.username = username
+        self.metadata_options = metadata_options
+        self.formatted_datetime = self.datetime_now()
+        self.filename = f'{self.username}_tweets_{self.formatted_datetime}'
+        self.dataframe = self.create_dataframe(self)
 
-from viz_tweets import *
+    @staticmethod
+    def datetime_now():
+        """Formats datetime."""
+        now = datetime.datetime.now()
+        formatted_now = now.strftime("%Y%m%d%H%M%S")
+        formatted_now = re.sub(r'\W+', '', formatted_now)
+        return formatted_now
 
-def datetime_now():
-    """Formats datetime."""
-    now = datetime.datetime.now()
+    @staticmethod
+    def transpose_matrix(data, fill_value=None):
+        """Transposes a matrix, filling in missing values with a specified fill value if needed."""
+        max_length = max(len(sublist) for sublist in data.values())
 
-    formatted_now = now.strftime("%Y%m%d%H%M%S")
+        filled_data = {
+            key: value + [fill_value] * (max_length - len(value))
+            for key, value in data.items()
+        }
 
-    formatted_now = re.sub(r'\W+', '', formatted_now)
+        return filled_data
 
-    return formatted_now
+    @staticmethod
+    def create_dataframe(self):
+        """Creates a DataFrame from the transposed data."""
+        data_transposed = self.transpose_matrix(self.data)
+        df = pd.DataFrame(data_transposed, columns=self.metadata_options)
 
-def transpose_matrix(data, fill_value=None):
-    """Transposes a matrix, filling in missing values with a specified fill value if needed."""
-    max_length = max(len(sublist) for sublist in data)
-    filled_data = [
-        sublist + [fill_value] * (max_length - len(sublist))
-        for sublist in data
-    ]
+        return df
 
-    data_transposed = [list(row) for row in zip(*filled_data)]
+    def save_to_csv(self):
+        """Saves the DataFrame to a CSV file."""
+        csv_file_path = f'{self.filename}.csv'
+        self.dataframe.to_csv(csv_file_path, index=False)
 
-    return data_transposed
+        print(f'Saved to {csv_file_path}')
 
+    def save_to_json(self):
+        """Saves the DataFrame to a JSON file."""
+        json_file_path = f'{self.filename}.json'
+        self.dataframe.to_json(json_file_path, orient='records', lines=False)
 
-def save_tweets(data, username):
-    """Saves parsed archived tweets in CSV, JSON, and HTML formats."""
-    data_transposed = transpose_matrix(data)
+        print(f'Saved to {json_file_path}')
 
-    formatted_datetime = datetime_now()
-    filename = f'{username}_tweets_{formatted_datetime}'
+    def save_to_html(self):
+        """Saves the DataFrame to an HTML file."""
+        json_file_path = f'{self.filename}.json'
 
-    df = pd.DataFrame(data_transposed,
-                      columns=[
-                          'archived_urlkey', 'archived_timestamp', 'tweet',
-                          'archived_tweet', 'parsed_tweet',
-                          'parsed_tweet_mimetype_json',
-                          'parsed_archived_tweet', 'archived_mimetype',
-                          'archived_statuscode', 'archived_digest',
-                          'archived_length', 'available_tweet_content',
-                          'available_tweet_is_RT', 'available_tweet_username'
-                      ])
+        if not os.path.exists(json_file_path):
+            self.save_to_json()
 
-    csv_file_path = f'{filename}.csv'
-    df.to_csv(csv_file_path, index=False)
+        html_file_path = f'{self.filename}.html'
 
-    json_file_path = f'{filename}.json'
-    df.to_json(json_file_path, orient='records', lines=False)
+        html = HTMLTweetsVisualizer(json_file_path, html_file_path,
+                                    self.username)
 
-    html_file_path = f'{filename}.html'
-    json_content = read_json(json_file_path)
-    html_content = generate_html(json_content, username)
-    save_html(html_file_path, html_content)
+        html_content = html.generate()
+        html.save(html_content)
 
-    print(
-        f'Done. Check the files {filename}.csv, {filename}.json and {filename}.html'
-    )
+        print(f'Saved to {html_file_path}')
diff --git a/waybacktweets/main.py b/waybacktweets/main.py
index 0fc9ca2..8dac791 100644
--- a/waybacktweets/main.py
+++ b/waybacktweets/main.py
@@ -2,9 +2,9 @@
 Main function for retrieving archived tweets.
 """
 
-from request_tweets import *
-from parse_tweets import *
-from export_tweets import *
+from request_tweets import WaybackTweets
+from parse_tweets import TweetsParser
+from export_tweets import TweetsExporter
 
 username = 'claromes'
 unique = False
@@ -13,18 +13,33 @@ datetime_to = ''
 
 
 def main():
-    """Invokes the functions to retrieve archived tweets, perform necessary parsing, and save the data."""
+    """Invokes the classes to retrieve archived tweets, perform necessary parsing, and save the data."""
     try:
-        archived_tweets = get_archived_tweets(username, unique, datetime_from,
-                                              datetime_to)
-        if archived_tweets:
-            data = parse_archived_tweets(archived_tweets, username)
-
-            save_tweets(data, username)
+        api = WaybackTweets(username)
+        archived_tweets = api.get()
 
-        print(
-            f'\nNeed help? Open an issue: https://github.com/claromes/waybacktweets/issues.'
-        )
+        if archived_tweets:
+            metadata_options = [
+                'archived_urlkey', 'archived_timestamp', 'tweet',
+                'archived_tweet', 'parsed_tweet', 'parsed_tweet_mimetype_json',
+                'available_tweet_content', 'available_tweet_is_RT',
+                'available_tweet_username', 'parsed_archived_tweet',
+                'archived_mimetype', 'archived_statuscode', 'archived_digest',
+                'archived_length'
+            ]
+
+            parser = TweetsParser(archived_tweets, username, metadata_options)
+            parsed_tweets = parser.parse()
+
+            exporter = TweetsExporter(parsed_tweets, username,
+                                      metadata_options)
+            exporter.save_to_csv()
+            exporter.save_to_json()
+            exporter.save_to_html()
+
+            print(
+                f'\nNeed help? Open an issue: https://github.com/claromes/waybacktweets/issues.'
+            )
     except TypeError as e:
         print(e)
diff --git a/waybacktweets/parse_tweets.py b/waybacktweets/parse_tweets.py
index 3513cb4..6f86356 100644
--- a/waybacktweets/parse_tweets.py
+++ b/waybacktweets/parse_tweets.py
@@ -1,148 +1,149 @@
-"""
-Parses the returned data from the Wayback CDX Server API.
-"""
-
 import requests
 import re
 from urllib.parse import unquote
 
 from utils import *
 
 
-def embed(tweet):
-    """Parses the archived tweets when the tweets are still available using the Twitter Publish service from X.
-    Returns the text of the tweet, if it's a retweet, and the username of the account."""
-    try:
-        url = f'https://publish.twitter.com/oembed?url={tweet}'
-        response = requests.get(url)
-
-        regex = r'<blockquote(?: [^>]+)?><p[^>]*>(.*?)<\/p>.*?&mdash; (.*?)<\/a>'
-        regex_author = r'^(.*?)\s*\('
-
-        if not (400 <= response.status_code <= 511):
-            html = response.json()['html']
-            author_name = response.json()['author_name']
-
-            matches_html = re.findall(regex, html, re.DOTALL)
-
-            tweet_content = []
-            user_info = []
-            is_RT = []
-
-            for match in matches_html:
-                tweet_content_match = re.sub(r'<a[^>]*>|<\/a>', '',
-                                             match[0].strip())
-                tweet_content_match = tweet_content_match.replace('<br>', '\n')
-                user_info_match = re.sub(r'<a[^>]*>|<\/a>', '',
-                                         match[1].strip())
-                user_info_match = user_info_match.replace(')', '), ')
-
-                match_author = re.search(regex_author, user_info_match)
-                author_tweet = match_author.group(1)
-
-                if tweet_content_match:
-                    tweet_content.append(tweet_content_match)
-                if user_info_match:
-                    user_info.append(user_info_match)
-
-                is_RT_match = False
-                if author_name != author_tweet:
-                    is_RT_match = True
-
-                is_RT.append(is_RT_match)
-
-            return tweet_content, is_RT, user_info
-    except:
-        return None
-
-
-def parse_json_mimetype(tweet):
-    """Parses the archived tweets when the mimetype is application/json and returns the text of the tweet."""
-    response_json = requests.get(tweet)
-
-    if not (400 <= response_json.status_code <= 511):
-        json_data = response_json.json()
-
-        if 'data' in json_data:
-            if 'text' in json_data['data']:
-                json_text = json_data['data']['text']
-                return json_text
-            else:
-                json_text = json_data['data']
-                return json_text
-        else:
-            if 'text' in json_data:
-                json_text = json_data['text']
-                return json_text
-            else:
-                json_text = json_data
-                return json_text
-
-
-def parse_archived_tweets(archived_tweets_response, username):
-    """Parses the archived tweets metadata and structures it in a more readable format."""
-    archived_urlkey = []
-    archived_timestamp = []
-    tweet = []
-    archived_tweet = []
-    parsed_tweet = []
-    parsed_tweet_mimetype_json = []
-    available_tweet_content = []
-    available_tweet_is_RT = []
-    available_tweet_username = []
-    parsed_archived_tweet = []
-    archived_mimetype = []
-    archived_statuscode = []
-    archived_digest = []
-    archived_length = []
-
-    for response in archived_tweets_response[1:]:
-        tweet_remove_char = unquote(response[2]).replace('’', '')
-        cleaned_tweet = pattern_tweet(tweet_remove_char).strip('"')
-
-        wayback_machine_url = f'https://web.archive.org/web/{response[1]}/{tweet_remove_char}'
-
-        original_tweet = delete_tweet_pathnames(
-            clean_tweet_url(cleaned_tweet, username))
-
-        parsed_wayback_machine_url = f'https://web.archive.org/web/{response[1]}/{original_tweet}'
-
-        double_status = check_double_status(wayback_machine_url,
-                                            original_tweet)
-
-        if double_status:
-            original_tweet = delete_tweet_pathnames(
-                f'https://twitter.com/{original_tweet}')
-
-        elif not '://' in original_tweet:
+class TwitterEmbed:
+    """Handles parsing of tweets using the Twitter Publish service."""
+
+    def __init__(self, tweet_url):
+        self.tweet_url = tweet_url
+
+    def embed(self):
+        """Parses the archived tweets when they are still available."""
+        try:
+            url = f'https://publish.twitter.com/oembed?url={self.tweet_url}'
+            response = requests.get(url)
+            if not (400 <= response.status_code <= 511):
+                html = response.json()['html']
+                author_name = response.json()['author_name']
+
+                regex = r'<blockquote(?: [^>]+)?><p[^>]*>(.*?)<\/p>.*?&mdash; (.*?)<\/a>'
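---

A quick sketch of the padding logic this commit moves into TweetsExporter.transpose_matrix. Despite its name, the new version no longer transposes rows: it pads every metadata list to a common length and returns a dict, which pandas then consumes column by column. The sample values below are made-up placeholders, not data from the patch:

    import pandas as pd

    # Hypothetical metadata columns of unequal length.
    data = {
        'tweet': ['https://twitter.com/user/status/1',
                  'https://twitter.com/user/status/2'],
        'archived_timestamp': ['20240604000000'],  # one entry short
    }

    # Same padding as TweetsExporter.transpose_matrix (fill_value=None).
    max_length = max(len(sublist) for sublist in data.values())
    filled_data = {
        key: value + [None] * (max_length - len(value))
        for key, value in data.items()
    }

    # pandas reads the dict column-wise, so no explicit transpose is needed;
    # the short column ends with None instead of raising a length mismatch.
    df = pd.DataFrame(filled_data, columns=['tweet', 'archived_timestamp'])
    print(df)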