@{username} archived tweets

From: Claromes Date: Tue, 28 May 2024 09:30:07 +0000 (-0300) Subject: add generate_html X-Git-Url: https://git.claromes.com/?a=commitdiff_plain;h=8a4debb7f9a7dc8682c4b24d7bdb8a5b137d10d3;p=waybacktweets.git add generate_html --- diff --git a/.gitignore b/.gitignore index 64b33db..04e940c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ .venv/ *.csv *.json +*.html waybacktweets/__pycache__ waybacktweets/notes.md diff --git a/waybacktweets/export_tweets.py b/waybacktweets/export_tweets.py index efcd015..03847c9 100644 --- a/waybacktweets/export_tweets.py +++ b/waybacktweets/export_tweets.py @@ -2,6 +2,8 @@ import pandas as pd import re import datetime +from viz_tweets import * + def datetime_now(): now = datetime.datetime.now() @@ -25,7 +27,7 @@ def transpose_matrix(data, fill_value=None): return data_transposed -def response_tweets_csv(data, username): +def response_tweets(data, username): data_transposed = transpose_matrix(data) formatted_datetime = datetime_now() @@ -48,4 +50,12 @@ def response_tweets_csv(data, username): json_file_path = f'{filename}.json' df.to_json(json_file_path, orient='records', lines=False) - print(f'Done. Check the files {filename}.csv and {filename}.json') + html_file_path = f'{filename}.html' + + json_content = read_json(json_file_path) + html_content = generate_html(json_content, username) + save_html(html_file_path, html_content) + + print( + f'Done. Check the files {filename}.csv, {filename}.json and {filename}.html' + ) diff --git a/waybacktweets/main.py b/waybacktweets/main.py index ae63093..eb1f795 100644 --- a/waybacktweets/main.py +++ b/waybacktweets/main.py @@ -1,5 +1,5 @@ from request_tweets import * -from tweet_parse import * +from parse_tweets import * from export_tweets import * username = 'claromes' @@ -15,7 +15,7 @@ def main(): if archived_tweets: data = parse_archived_tweets(archived_tweets, username) - response_tweets_csv(data, username) + response_tweets(data, username) print( f'\nNeed help? Open an issue: https://github.com/claromes/waybacktweets/issues.' diff --git a/waybacktweets/parse_tweets.py b/waybacktweets/parse_tweets.py new file mode 100644 index 0000000..5486326 --- /dev/null +++ b/waybacktweets/parse_tweets.py @@ -0,0 +1,174 @@ +import requests +import re +from urllib.parse import unquote +from utils import * + + +def embed(tweet): + try: + url = f'https://publish.twitter.com/oembed?url={tweet}' + response = requests.get(url) + + regex = r'

]+)?>]*>(.*?)<\/p>.*?— (.*?)<\/a>' + regex_author = r'^(.*?)\s*\(' + + if not (400 <= response.status_code <= 511): + html = response.json()['html'] + author_name = response.json()['author_name'] + + matches_html = re.findall(regex, html, re.DOTALL) + + tweet_content = [] + user_info = [] + is_RT = [] + + for match in matches_html: + tweet_content_match = re.sub(r']*>|<\/a>', '', + match[0].strip()) + tweet_content_match = tweet_content_match.replace('
', '\n') + + user_info_match = re.sub(r']*>|<\/a>', '', + match[1].strip()) + user_info_match = user_info_match.replace(')', '), ') + + match_author = re.search(regex_author, user_info_match) + author_tweet = match_author.group(1) + + if tweet_content_match: + tweet_content.append(tweet_content_match) + if user_info_match: + user_info.append(user_info_match) + + is_RT_match = False + if author_name != author_tweet: + is_RT_match = True + + is_RT.append(is_RT_match) + + return tweet_content, is_RT, user_info + except: + return None + + +def parse_json_mimetype(tweet): + response_json = requests.get(tweet) + + if not (400 <= response_json.status_code <= 511): + json_data = response_json.json() + + if 'data' in json_data: + if 'text' in json_data['data']: + json_text = json_data['data']['text'] + return json_text + else: + json_text = json_data['data'] + return json_text + else: + if 'text' in json_data: + json_text = json_data['text'] + return json_text + else: + json_text = json_data + return json_text + + +def parse_archived_tweets(archived_tweets_response, username): + archived_urlkey = [] + archived_timestamp = [] + tweet = [] + archived_tweet = [] + parsed_tweet = [] + parsed_tweet_mimetype_json = [] + available_tweet_content = [] + available_tweet_is_RT = [] + available_tweet_username = [] + parsed_archived_tweet = [] + archived_mimetype = [] + archived_statuscode = [] + archived_digest = [] + archived_length = [] + + for response in archived_tweets_response[1:]: + tweet_remove_char = unquote(response[2]).replace('â', '') + cleaned_tweet = pattern_tweet(tweet_remove_char).strip('"') + + wayback_machine_url = f'https://web.archive.org/web/{response[1]}/{tweet_remove_char}' + + original_tweet = delete_tweet_pathnames( + clean_tweet(cleaned_tweet, username)) + + parsed_wayback_machine_url = f'https://web.archive.org/web/{response[1]}/{original_tweet}' + + double_status = check_double_status(wayback_machine_url, + original_tweet) + + if double_status: + original_tweet = delete_tweet_pathnames( + f'https://twitter.com/{original_tweet}') + + elif not '://' in original_tweet: + original_tweet = delete_tweet_pathnames( + f'https://{original_tweet}') + + encoded_tweet = semicolon_parse(response[2]) + encoded_archived_tweet = semicolon_parse(wayback_machine_url) + encoded_parsed_tweet = semicolon_parse(original_tweet) + encoded_parsed_archived_tweet = semicolon_parse( + parsed_wayback_machine_url) + + content = embed(encoded_tweet) + if content: + available_tweet_content.append(content[0][0]) + available_tweet_is_RT.append(content[1][0]) + available_tweet_username.append(content[2][0]) + + if response[3] == 'application/json': + json_mimetype = parse_json_mimetype(encoded_archived_tweet) + parsed_tweet_mimetype_json.append(json_mimetype) + + archived_urlkey.append(response[0]) + archived_timestamp.append(response[1]) + tweet.append(encoded_tweet) + archived_tweet.append(encoded_archived_tweet) + parsed_tweet.append(encoded_parsed_tweet) + parsed_archived_tweet.append(encoded_parsed_archived_tweet) + archived_mimetype.append(response[3]) + archived_statuscode.append(response[4]) + archived_digest.append(response[5]) + archived_length.append(response[6]) + + return archived_urlkey, archived_timestamp, tweet, archived_tweet, parsed_tweet, parsed_tweet_mimetype_json, parsed_archived_tweet, archived_mimetype, archived_statuscode, archived_digest, archived_length, available_tweet_content, available_tweet_is_RT, available_tweet_username + + +# if tweet_links[i]: +# link = parsed_links[i] +# tweet = embed(tweet_links[i]) + +# parse = parse_links(links) +# parsed_links = parse[0] +# tweet_links = parse[1] +# mimetype = parse[2] +# timestamp = parse[3] + +# def display_not_tweet(): +# original_link = delete_tweet_pathnames(clean_tweet(tweet_links[i])) + +# if status: +# original_link = delete_tweet_pathnames( +# f'https://twitter.com/{tweet_links[i]}') +# elif not '://' in tweet_links[i]: +# original_link = delete_tweet_pathnames(f'https://{tweet_links[i]}') + +# response_html = requests.get(original_link) + +# if mimetype[i] == 'text/html' or mimetype[i] == 'warc/revisit' or mimetype[ +# i] == 'unk': +# if ('.jpg' in tweet_links[i] or '.png' +# in tweet_links[i]) and response_html.status_code == 200: +# components.iframe(tweet_links[i], height=500, scrolling=True) +# elif '/status/' not in original_link: +# st.info("This isn't a status or is not available") +# elif status or f'{st.session_state.current_handle}' not in original_link: +# st.info(f'Replying to {st.session_state.current_handle}') +# else: +# components.iframe(clean_link(link), height=500, scrolling=True) diff --git a/waybacktweets/request_tweets.py b/waybacktweets/request_tweets.py index cc4b2a2..72f9f0e 100644 --- a/waybacktweets/request_tweets.py +++ b/waybacktweets/request_tweets.py @@ -14,7 +14,7 @@ def get_archived_tweets(username, if timestamp_to: timestamp_to = f'&to={timestamp_to}' - url = f'https://web.archive.org/cdx/search/cdx?url=https://twitter.com/{username}/status/*&output=json{unique}{timestamp_from}{timestamp_to}&limit=10' + url = f'https://web.archive.org/cdx/search/cdx?url=https://twitter.com/{username}/status/*&output=json{unique}{timestamp_from}{timestamp_to}&limit=100' print(f'Getting and parsing archived tweets from {url}') try: diff --git a/waybacktweets/tweet_parse.py b/waybacktweets/tweet_parse.py deleted file mode 100644 index 167e1ce..0000000 --- a/waybacktweets/tweet_parse.py +++ /dev/null @@ -1,213 +0,0 @@ -import requests -import re -from urllib.parse import unquote -from utils import * - - -def embed(tweet): - try: - url = f'https://publish.twitter.com/oembed?url={tweet}' - response = requests.get(url) - - regex = r'
]+)?>]*>(.*?)<\/p>.*?— (.*?)<\/a>' - regex_author = r'^(.*?)\s*\(' - - if not (400 <= response.status_code <= 511): - html = response.json()['html'] - author_name = response.json()['author_name'] - - matches_html = re.findall(regex, html, re.DOTALL) - - tweet_content = [] - user_info = [] - is_RT = [] - - for match in matches_html: - tweet_content_match = re.sub(r']*>|<\/a>', '', - match[0].strip()) - tweet_content_match = tweet_content_match.replace('
', '\n') - - user_info_match = re.sub(r']*>|<\/a>', '', - match[1].strip()) - user_info_match = user_info_match.replace(')', '), ') - - match_author = re.search(regex_author, user_info_match) - author_tweet = match_author.group(1) - - if tweet_content_match: - tweet_content.append(tweet_content_match) - if user_info_match: - user_info.append(user_info_match) - - is_RT_match = False - if author_name != author_tweet: - is_RT_match = True - - is_RT.append(is_RT_match) - - return tweet_content, is_RT, user_info - except: - return None - - -def parse_json_mimetype(tweet): - response_json = requests.get(tweet) - - if not (400 <= response_json.status_code <= 511): - json_data = response_json.json() - - if 'data' in json_data: - if 'text' in json_data['data']: - json_text = json_data['data']['text'] - return json_text - else: - json_text = json_data['data'] - return json_text - else: - if 'text' in json_data: - json_text = json_data['text'] - return json_text - else: - json_text = json_data - return json_text - - -def parse_archived_tweets(archived_tweets_response, username): - archived_urlkey = [] - archived_timestamp = [] - tweet = [] - archived_tweet = [] - parsed_tweet = [] - parsed_tweet_mimetype_json = [] - available_tweet_content = [] - available_tweet_is_RT = [] - available_tweet_username = [] - parsed_archived_tweet = [] - archived_mimetype = [] - archived_statuscode = [] - archived_digest = [] - archived_length = [] - - for response in archived_tweets_response[1:]: - tweet_remove_char = unquote(response[2]).replace('â', '') - cleaned_tweet = pattern_tweet(tweet_remove_char).strip('"') - - wayback_machine_url = f'https://web.archive.org/web/{response[1]}/{tweet_remove_char}' - - original_tweet = delete_tweet_pathnames( - clean_tweet(cleaned_tweet, username)) - - parsed_wayback_machine_url = f'https://web.archive.org/web/{response[1]}/{original_tweet}' - - double_status = check_double_status(wayback_machine_url, - original_tweet) - - if double_status: - original_tweet = delete_tweet_pathnames( - f'https://twitter.com/{original_tweet}') - - elif not '://' in original_tweet: - original_tweet = delete_tweet_pathnames( - f'https://{original_tweet}') - - encoded_tweet = semicolon_parse(response[2]) - encoded_archived_tweet = semicolon_parse(wayback_machine_url) - encoded_parsed_tweet = semicolon_parse(original_tweet) - encoded_parsed_archived_tweet = semicolon_parse( - parsed_wayback_machine_url) - - content = embed(encoded_tweet) - if content: - available_tweet_content.append(content[0][0]) - available_tweet_is_RT.append(content[1][0]) - available_tweet_username.append(content[2][0]) - - if response[3] == 'application/json': - json_mimetype = parse_json_mimetype(encoded_archived_tweet) - parsed_tweet_mimetype_json.append(json_mimetype) - - archived_urlkey.append(response[0]) - archived_timestamp.append(response[1]) - tweet.append(encoded_tweet) - archived_tweet.append(encoded_archived_tweet) - parsed_tweet.append(encoded_parsed_tweet) - parsed_archived_tweet.append(encoded_parsed_archived_tweet) - archived_mimetype.append(response[3]) - archived_statuscode.append(response[4]) - archived_digest.append(response[5]) - archived_length.append(response[6]) - - return archived_urlkey, archived_timestamp, tweet, archived_tweet, parsed_tweet, parsed_tweet_mimetype_json, parsed_archived_tweet, archived_mimetype, archived_statuscode, archived_digest, archived_length, available_tweet_content, available_tweet_is_RT, available_tweet_username - - -# if tweet_links[i]: -# link = parsed_links[i] -# tweet = embed(tweet_links[i]) - -# parse = parse_links(links) -# parsed_links = parse[0] -# tweet_links = parse[1] -# mimetype = parse[2] -# timestamp = parse[3] - -# def display_not_tweet(): -# original_link = delete_tweet_pathnames(clean_tweet(tweet_links[i])) - -# if status: -# original_link = delete_tweet_pathnames( -# f'https://twitter.com/{tweet_links[i]}') -# elif not '://' in tweet_links[i]: -# original_link = delete_tweet_pathnames(f'https://{tweet_links[i]}') - -# response_html = requests.get(original_link) - -# if mimetype[i] == 'text/html' or mimetype[i] == 'warc/revisit' or mimetype[ -# i] == 'unk': -# if ('.jpg' in tweet_links[i] or '.png' -# in tweet_links[i]) and response_html.status_code == 200: -# components.iframe(tweet_links[i], height=500, scrolling=True) -# elif '/status/' not in original_link: -# st.info("This isn't a status or is not available") -# elif status or f'{st.session_state.current_handle}' not in original_link: -# st.info(f'Replying to {st.session_state.current_handle}') -# else: -# components.iframe(clean_link(link), height=500, scrolling=True) - -# elif mimetype[i] == 'application/json': -# try: -# response_json = requests.get(link) - -# if response_json.status_code == 200: -# json_data = response_json.json() - -# if 'data' in json_data: -# if 'text' in json_data['data']: -# json_text = json_data['data']['text'] -# else: -# json_text = json_data['data'] -# else: -# if 'text' in json_data: -# json_text = json_data['text'] -# else: -# json_text = json_data - -# st.code(json_text) -# st.json(json_data, expanded=False) - -# st.divider() -# else: -# st.error(response_json.status_code) - -# st.divider() -# except requests.exceptions.Timeout: -# st.error('Connection to web.archive.org timed out.') -# st.divider() -# except requests.exceptions.ConnectionError: -# st.error( -# 'Failed to establish a new connection with web.archive.org.') -# st.divider() -# except UnboundLocalError: -# st.empty() -# else: -# st.warning('MIME Type was not parsed.') -# st.divider() diff --git a/waybacktweets/viz_tweets.py b/waybacktweets/viz_tweets.py new file mode 100644 index 0000000..ac47297 --- /dev/null +++ b/waybacktweets/viz_tweets.py @@ -0,0 +1,50 @@ +import json + + +def read_json(json_file_path): + with open(json_file_path, 'r', encoding='utf-8') as f: + return json.load(f) + + +def generate_html(json_content, username): + html = f'\n\n@{username} archived tweets\n' + html += '\n' + html += '\n\n' + html += f'
@{username} archived tweets
\n' + html += '
\n' + + for tweet in json_content: + html += '
\n' + html += f'
Archived Timestamp: {tweet["archived_timestamp"]}
\n' + html += f'
Archived URL Key: {tweet["archived_urlkey"]}
\n' + html += f'
Tweet: {tweet["tweet"]}
\n' + html += f'
Archived Tweet: {tweet["archived_tweet"]}
\n' + html += f'
Parsed Tweet: {tweet["parsed_tweet"]}
\n' + html += f'
Parsed Tweet Mimetype JSON: {tweet["parsed_tweet_mimetype_json"]}
\n' + html += f'
Parsed Archived Tweet: {tweet["parsed_archived_tweet"]}
\n' + html += f'
Archived Mimetype: {tweet["archived_mimetype"]}
\n' + html += f'
Archived Statuscode: {tweet["archived_statuscode"]}
\n' + html += f'
Archived Digest: {tweet["archived_digest"]}
\n' + html += f'
Archived Length: {tweet["archived_length"]}
\n' + html += f'
Available Tweet Content: {tweet["available_tweet_content"]}
\n' + html += f'
Available Tweet Is Retweet: {tweet["available_tweet_is_RT"]}
\n' + html += f'
Available Tweet Username: {tweet["available_tweet_username"]}
\n' + html += '
\n' + + html += '
\n' + html += '\n' + + return html + + +def save_html(html_file_path, html_content): + with open(html_file_path, 'w', encoding='utf-8') as f: + f.write(html_content)