From: Claromes Date: Tue, 28 May 2024 07:19:39 +0000 (-0300) Subject: parse available tweet X-Git-Url: https://git.claromes.com/?a=commitdiff_plain;h=571e09fcfb9febce0edf79b95c18f792be2e1cd0;p=waybacktweets.git parse available tweet --- diff --git a/waybacktweets/export_tweets.py b/waybacktweets/export_tweets.py index babb19b..39be684 100644 --- a/waybacktweets/export_tweets.py +++ b/waybacktweets/export_tweets.py @@ -13,8 +13,20 @@ def datetime_now(): return formatted_now +def transpose_matrix(data, fill_value=None): + max_length = max(len(sublist) for sublist in data) + filled_data = [ + sublist + [fill_value] * (max_length - len(sublist)) + for sublist in data + ] + + data_transposed = [list(row) for row in zip(*filled_data)] + + return data_transposed + + def response_tweets_csv(data, username): - data_transposed = list(zip(*data)) + data_transposed = transpose_matrix(data) formatted_datetime = datetime_now() filename = f'{username}_tweets_{formatted_datetime}' @@ -25,7 +37,8 @@ def response_tweets_csv(data, username): 'archived_tweet', 'parsed_tweet', 'parsed_archived_tweet', 'archived_mimetype', 'archived_statuscode', 'archived_digest', - 'archived_length' + 'archived_length', 'available_tweet_content', + 'available_tweet_is_RT', 'available_tweet_username' ]) csv_file_path = f'{filename}.csv' diff --git a/waybacktweets/main.py b/waybacktweets/main.py index c3dacfc..84e26ce 100644 --- a/waybacktweets/main.py +++ b/waybacktweets/main.py @@ -2,7 +2,7 @@ from request_tweets import * from tweet_parse import * from export_tweets import * -username = 'dfrlab' +username = 'cnn' unique = False datetime_from = '' datetime_to = '' @@ -16,6 +16,10 @@ def main(): data = parse_archived_tweets(archived_tweets, username) response_tweets_csv(data, username) + + print( + f'\nNeed help? Open an issue: https://github.com/claromes/waybacktweets/issues.' + ) except TypeError as e: print(e) diff --git a/waybacktweets/request_tweets.py b/waybacktweets/request_tweets.py index 3986ff3..cc4b2a2 100644 --- a/waybacktweets/request_tweets.py +++ b/waybacktweets/request_tweets.py @@ -14,7 +14,7 @@ def get_archived_tweets(username, if timestamp_to: timestamp_to = f'&to={timestamp_to}' - url = f'https://web.archive.org/cdx/search/cdx?url=https://twitter.com/{username}/status/*&output=json{unique}{timestamp_from}{timestamp_to}' + url = f'https://web.archive.org/cdx/search/cdx?url=https://twitter.com/{username}/status/*&output=json{unique}{timestamp_from}{timestamp_to}&limit=10' print(f'Getting and parsing archived tweets from {url}') try: diff --git a/waybacktweets/tweet_parse.py b/waybacktweets/tweet_parse.py index b9ed287..847264d 100644 --- a/waybacktweets/tweet_parse.py +++ b/waybacktweets/tweet_parse.py @@ -1,13 +1,64 @@ +import requests +import re from urllib.parse import unquote from utils import * +def embed(tweet): + try: + url = f'https://publish.twitter.com/oembed?url={tweet}' + response = requests.get(url) + + regex = r'
]+)?>]*>(.*?)<\/p>.*?— (.*?)<\/a>' + regex_author = r'^(.*?)\s*\(' + + if not (400 <= response.status_code <= 511): + html = response.json()['html'] + author_name = response.json()['author_name'] + + matches_html = re.findall(regex, html, re.DOTALL) + + tweet_content = [] + user_info = [] + is_RT = [] + + for match in matches_html: + tweet_content_match = re.sub(r']*>|<\/a>', '', + match[0].strip()) + tweet_content_match = tweet_content_match.replace('
', '\n') + + user_info_match = re.sub(r']*>|<\/a>', '', + match[1].strip()) + user_info_match = user_info_match.replace(')', '), ') + + match_author = re.search(regex_author, user_info_match) + author_tweet = match_author.group(1) + + if tweet_content_match: + tweet_content.append(tweet_content_match) + if user_info_match: + user_info.append(user_info_match) + + is_RT_match = False + if author_name != author_tweet: + is_RT_match = True + + is_RT.append(is_RT_match) + + return tweet_content, is_RT, user_info + except: + return None + + def parse_archived_tweets(archived_tweets_response, username): archived_urlkey = [] archived_timestamp = [] tweet = [] archived_tweet = [] parsed_tweet = [] + available_tweet_content = [] + available_tweet_is_RT = [] + available_tweet_username = [] parsed_archived_tweet = [] archived_mimetype = [] archived_statuscode = [] @@ -42,6 +93,12 @@ def parse_archived_tweets(archived_tweets_response, username): encoded_parsed_archived_tweet = semicolon_parse( parsed_wayback_machine_url) + content = embed(encoded_tweet) + if content: + available_tweet_content.append(content[0][0]) + available_tweet_is_RT.append(content[1][0]) + available_tweet_username.append(content[2][0]) + archived_urlkey.append(response[0]) archived_timestamp.append(response[1]) tweet.append(encoded_tweet) @@ -53,77 +110,18 @@ def parse_archived_tweets(archived_tweets_response, username): archived_digest.append(response[5]) archived_length.append(response[6]) - return archived_urlkey, archived_timestamp, tweet, archived_tweet, parsed_tweet, parsed_archived_tweet, archived_mimetype, archived_statuscode, archived_digest, archived_length - - -# def embed(tweet): -# try: -# url = f'https://publish.twitter.com/oembed?url={clean_tweet(tweet)}' -# response = requests.get(url) - -# regex = r'