From 8a4debb7f9a7dc8682c4b24d7bdb8a5b137d10d3 Mon Sep 17 00:00:00 2001 From: Claromes Date: Tue, 28 May 2024 06:30:07 -0300 Subject: [PATCH] add generate_html --- .gitignore | 1 + waybacktweets/export_tweets.py | 14 +++++- waybacktweets/main.py | 4 +- .../{tweet_parse.py => parse_tweets.py} | 39 --------------- waybacktweets/request_tweets.py | 2 +- waybacktweets/viz_tweets.py | 50 +++++++++++++++++++ 6 files changed, 66 insertions(+), 44 deletions(-) rename waybacktweets/{tweet_parse.py => parse_tweets.py} (82%) create mode 100644 waybacktweets/viz_tweets.py diff --git a/.gitignore b/.gitignore index 64b33db..04e940c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ .venv/ *.csv *.json +*.html waybacktweets/__pycache__ waybacktweets/notes.md diff --git a/waybacktweets/export_tweets.py b/waybacktweets/export_tweets.py index efcd015..03847c9 100644 --- a/waybacktweets/export_tweets.py +++ b/waybacktweets/export_tweets.py @@ -2,6 +2,8 @@ import pandas as pd import re import datetime +from viz_tweets import * + def datetime_now(): now = datetime.datetime.now() @@ -25,7 +27,7 @@ def transpose_matrix(data, fill_value=None): return data_transposed -def response_tweets_csv(data, username): +def response_tweets(data, username): data_transposed = transpose_matrix(data) formatted_datetime = datetime_now() @@ -48,4 +50,12 @@ def response_tweets_csv(data, username): json_file_path = f'{filename}.json' df.to_json(json_file_path, orient='records', lines=False) - print(f'Done. Check the files {filename}.csv and {filename}.json') + html_file_path = f'{filename}.html' + + json_content = read_json(json_file_path) + html_content = generate_html(json_content, username) + save_html(html_file_path, html_content) + + print( + f'Done. Check the files {filename}.csv, {filename}.json and {filename}.html' + ) diff --git a/waybacktweets/main.py b/waybacktweets/main.py index ae63093..eb1f795 100644 --- a/waybacktweets/main.py +++ b/waybacktweets/main.py @@ -1,5 +1,5 @@ from request_tweets import * -from tweet_parse import * +from parse_tweets import * from export_tweets import * username = 'claromes' @@ -15,7 +15,7 @@ def main(): if archived_tweets: data = parse_archived_tweets(archived_tweets, username) - response_tweets_csv(data, username) + response_tweets(data, username) print( f'\nNeed help? Open an issue: https://github.com/claromes/waybacktweets/issues.' diff --git a/waybacktweets/tweet_parse.py b/waybacktweets/parse_tweets.py similarity index 82% rename from waybacktweets/tweet_parse.py rename to waybacktweets/parse_tweets.py index 167e1ce..5486326 100644 --- a/waybacktweets/tweet_parse.py +++ b/waybacktweets/parse_tweets.py @@ -172,42 +172,3 @@ def parse_archived_tweets(archived_tweets_response, username): # st.info(f'Replying to {st.session_state.current_handle}') # else: # components.iframe(clean_link(link), height=500, scrolling=True) - -# elif mimetype[i] == 'application/json': -# try: -# response_json = requests.get(link) - -# if response_json.status_code == 200: -# json_data = response_json.json() - -# if 'data' in json_data: -# if 'text' in json_data['data']: -# json_text = json_data['data']['text'] -# else: -# json_text = json_data['data'] -# else: -# if 'text' in json_data: -# json_text = json_data['text'] -# else: -# json_text = json_data - -# st.code(json_text) -# st.json(json_data, expanded=False) - -# st.divider() -# else: -# st.error(response_json.status_code) - -# st.divider() -# except requests.exceptions.Timeout: -# st.error('Connection to web.archive.org timed out.') -# st.divider() -# except requests.exceptions.ConnectionError: -# st.error( -# 'Failed to establish a new connection with web.archive.org.') -# st.divider() -# except UnboundLocalError: -# st.empty() -# else: -# st.warning('MIME Type was not parsed.') -# st.divider() diff --git a/waybacktweets/request_tweets.py b/waybacktweets/request_tweets.py index cc4b2a2..72f9f0e 100644 --- a/waybacktweets/request_tweets.py +++ b/waybacktweets/request_tweets.py @@ -14,7 +14,7 @@ def get_archived_tweets(username, if timestamp_to: timestamp_to = f'&to={timestamp_to}' - url = f'https://web.archive.org/cdx/search/cdx?url=https://twitter.com/{username}/status/*&output=json{unique}{timestamp_from}{timestamp_to}&limit=10' + url = f'https://web.archive.org/cdx/search/cdx?url=https://twitter.com/{username}/status/*&output=json{unique}{timestamp_from}{timestamp_to}&limit=100' print(f'Getting and parsing archived tweets from {url}') try: diff --git a/waybacktweets/viz_tweets.py b/waybacktweets/viz_tweets.py new file mode 100644 index 0000000..ac47297 --- /dev/null +++ b/waybacktweets/viz_tweets.py @@ -0,0 +1,50 @@ +import json + + +def read_json(json_file_path): + with open(json_file_path, 'r', encoding='utf-8') as f: + return json.load(f) + + +def generate_html(json_content, username): + html = f'\n\n@{username} archived tweets\n' + html += '\n' + html += '\n\n' + html += f'

@{username} archived tweets

\n' + html += '
\n' + + for tweet in json_content: + html += '
\n' + html += f'

Archived Timestamp: {tweet["archived_timestamp"]}

\n' + html += f'

Archived URL Key: {tweet["archived_urlkey"]}

\n' + html += f'

Tweet: {tweet["tweet"]}

\n' + html += f'

Archived Tweet: {tweet["archived_tweet"]}

\n' + html += f'

Parsed Tweet: {tweet["parsed_tweet"]}

\n' + html += f'

Parsed Tweet Mimetype JSON: {tweet["parsed_tweet_mimetype_json"]}

\n' + html += f'

Parsed Archived Tweet: {tweet["parsed_archived_tweet"]}

\n' + html += f'

Archived Mimetype: {tweet["archived_mimetype"]}

\n' + html += f'

Archived Statuscode: {tweet["archived_statuscode"]}

\n' + html += f'

Archived Digest: {tweet["archived_digest"]}

\n' + html += f'

Archived Length: {tweet["archived_length"]}

\n' + html += f'

Available Tweet Content: {tweet["available_tweet_content"]}

\n' + html += f'

Available Tweet Is Retweet: {tweet["available_tweet_is_RT"]}

\n' + html += f'

Available Tweet Username: {tweet["available_tweet_username"]}

\n' + html += '
\n' + + html += '
\n' + html += '\n' + + return html + + +def save_html(html_file_path, html_content): + with open(html_file_path, 'w', encoding='utf-8') as f: + f.write(html_content) -- 2.34.1