From: Claromes
Date: Tue, 28 May 2024 07:39:50 +0000 (-0300)
Subject: parse json archived tweet
X-Git-Url: https://git.claromes.com/?a=commitdiff_plain;h=025fbe6594991db1370bb0f9df493ed61d8082c2;p=waybacktweets.git

parse json archived tweet
---

diff --git a/waybacktweets/export_tweets.py b/waybacktweets/export_tweets.py
index 39be684..efcd015 100644
--- a/waybacktweets/export_tweets.py
+++ b/waybacktweets/export_tweets.py
@@ -35,6 +35,7 @@ def response_tweets_csv(data, username):
                       columns=[
                           'archived_urlkey', 'archived_timestamp', 'tweet',
                           'archived_tweet', 'parsed_tweet',
+                          'parsed_tweet_mimetype_json',
                           'parsed_archived_tweet', 'archived_mimetype',
                           'archived_statuscode', 'archived_digest',
                           'archived_length', 'available_tweet_content',
diff --git a/waybacktweets/main.py b/waybacktweets/main.py
index 84e26ce..ae63093 100644
--- a/waybacktweets/main.py
+++ b/waybacktweets/main.py
@@ -2,7 +2,7 @@ from request_tweets import *
 from tweet_parse import *
 from export_tweets import *
 
-username = 'cnn'
+username = 'claromes'
 unique = False
 datetime_from = ''
 datetime_to = ''
diff --git a/waybacktweets/tweet_parse.py b/waybacktweets/tweet_parse.py
index 847264d..167e1ce 100644
--- a/waybacktweets/tweet_parse.py
+++ b/waybacktweets/tweet_parse.py
@@ -50,12 +50,35 @@ def embed(tweet):
         return None
 
 
+def parse_json_mimetype(tweet):
+    response_json = requests.get(tweet)
+
+    if not (400 <= response_json.status_code <= 511):
+        json_data = response_json.json()
+
+        if 'data' in json_data:
+            if 'text' in json_data['data']:
+                json_text = json_data['data']['text']
+                return json_text
+            else:
+                json_text = json_data['data']
+                return json_text
+        else:
+            if 'text' in json_data:
+                json_text = json_data['text']
+                return json_text
+            else:
+                json_text = json_data
+                return json_text
+
+
 def parse_archived_tweets(archived_tweets_response, username):
     archived_urlkey = []
     archived_timestamp = []
     tweet = []
     archived_tweet = []
     parsed_tweet = []
+    parsed_tweet_mimetype_json = []
     available_tweet_content = []
     available_tweet_is_RT = []
     available_tweet_username = []
@@ -99,6 +122,10 @@ def parse_archived_tweets(archived_tweets_response, username):
             available_tweet_is_RT.append(content[1][0])
             available_tweet_username.append(content[2][0])
 
+        if response[3] == 'application/json':
+            json_mimetype = parse_json_mimetype(encoded_archived_tweet)
+            parsed_tweet_mimetype_json.append(json_mimetype)
+
         archived_urlkey.append(response[0])
         archived_timestamp.append(response[1])
         tweet.append(encoded_tweet)
@@ -110,7 +137,7 @@ def parse_archived_tweets(archived_tweets_response, username):
         archived_digest.append(response[5])
         archived_length.append(response[6])
 
-    return archived_urlkey, archived_timestamp, tweet, archived_tweet, parsed_tweet, parsed_archived_tweet, archived_mimetype, archived_statuscode, archived_digest, archived_length, available_tweet_content, available_tweet_is_RT, available_tweet_username
+    return archived_urlkey, archived_timestamp, tweet, archived_tweet, parsed_tweet, parsed_tweet_mimetype_json, parsed_archived_tweet, archived_mimetype, archived_statuscode, archived_digest, archived_length, available_tweet_content, available_tweet_is_RT, available_tweet_username
 
 
 # if tweet_links[i]: