From: Claromes Date: Thu, 30 May 2024 11:17:32 +0000 (-0300) Subject: add basic docstrings X-Git-Url: https://git.claromes.com/?a=commitdiff_plain;h=db4359c8bfa703d65c8db48f5d095c1405c1ced9;p=waybacktweets.git add basic docstrings --- diff --git a/waybacktweets/__init__.py b/waybacktweets/__init__.py index e69de29..4802e90 100644 --- a/waybacktweets/__init__.py +++ b/waybacktweets/__init__.py @@ -0,0 +1 @@ +__version__ = "1.0" diff --git a/waybacktweets/export_tweets.py b/waybacktweets/export_tweets.py index 03847c9..38c7702 100644 --- a/waybacktweets/export_tweets.py +++ b/waybacktweets/export_tweets.py @@ -1,3 +1,7 @@ +""" +Exports the parsed archived tweets. +""" + import pandas as pd import re import datetime @@ -6,6 +10,7 @@ from viz_tweets import * def datetime_now(): + """Formats datetime.""" now = datetime.datetime.now() formatted_now = now.strftime("%Y%m%d%H%M%S") @@ -16,6 +21,7 @@ def datetime_now(): def transpose_matrix(data, fill_value=None): + """Transposes a matrix, filling in missing values with a specified fill value if needed.""" max_length = max(len(sublist) for sublist in data) filled_data = [ sublist + [fill_value] * (max_length - len(sublist)) @@ -27,7 +33,8 @@ def transpose_matrix(data, fill_value=None): return data_transposed -def response_tweets(data, username): +def save_tweets(data, username): + """Saves parsed archived tweets in CSV, JSON, and HTML formats.""" data_transposed = transpose_matrix(data) formatted_datetime = datetime_now() @@ -51,7 +58,6 @@ def response_tweets(data, username): df.to_json(json_file_path, orient='records', lines=False) html_file_path = f'{filename}.html' - json_content = read_json(json_file_path) html_content = generate_html(json_content, username) save_html(html_file_path, html_content) diff --git a/waybacktweets/main.py b/waybacktweets/main.py index eb1f795..0fc9ca2 100644 --- a/waybacktweets/main.py +++ b/waybacktweets/main.py @@ -1,3 +1,7 @@ +""" +Main function for retrieving archived tweets. +""" + from request_tweets import * from parse_tweets import * from export_tweets import * @@ -9,13 +13,14 @@ datetime_to = '' def main(): + """Invokes the functions to retrieve archived tweets, perform necessary parsing, and save the data.""" try: archived_tweets = get_archived_tweets(username, unique, datetime_from, datetime_to) if archived_tweets: data = parse_archived_tweets(archived_tweets, username) - response_tweets(data, username) + save_tweets(data, username) print( f'\nNeed help? Open an issue: https://github.com/claromes/waybacktweets/issues.' diff --git a/waybacktweets/parse_tweets.py b/waybacktweets/parse_tweets.py index 5486326..3513cb4 100644 --- a/waybacktweets/parse_tweets.py +++ b/waybacktweets/parse_tweets.py @@ -1,3 +1,7 @@ +""" +Parses the returned data from the Wayback CDX Server API. +""" + import requests import re from urllib.parse import unquote @@ -5,6 +9,8 @@ from utils import * def embed(tweet): + """Parses the archived tweets when the tweets are still available using the Twitter Publish service from X. + Returns the text of the tweet, if it's a retweet, and the username of the account.""" try: url = f'https://publish.twitter.com/oembed?url={tweet}' response = requests.get(url) @@ -51,6 +57,7 @@ def embed(tweet): def parse_json_mimetype(tweet): + """Parses the archived tweets when the mimetype is application/json and returns the text of the tweet.""" response_json = requests.get(tweet) if not (400 <= response_json.status_code <= 511): @@ -73,6 +80,7 @@ def parse_json_mimetype(tweet): def parse_archived_tweets(archived_tweets_response, username): + """Parses the archived tweets metadata and structures it in a more readable format.""" archived_urlkey = [] archived_timestamp = [] tweet = [] @@ -95,7 +103,7 @@ def parse_archived_tweets(archived_tweets_response, username): wayback_machine_url = f'https://web.archive.org/web/{response[1]}/{tweet_remove_char}' original_tweet = delete_tweet_pathnames( - clean_tweet(cleaned_tweet, username)) + clean_tweet_url(cleaned_tweet, username)) parsed_wayback_machine_url = f'https://web.archive.org/web/{response[1]}/{original_tweet}' @@ -138,37 +146,3 @@ def parse_archived_tweets(archived_tweets_response, username): archived_length.append(response[6]) return archived_urlkey, archived_timestamp, tweet, archived_tweet, parsed_tweet, parsed_tweet_mimetype_json, parsed_archived_tweet, archived_mimetype, archived_statuscode, archived_digest, archived_length, available_tweet_content, available_tweet_is_RT, available_tweet_username - - -# if tweet_links[i]: -# link = parsed_links[i] -# tweet = embed(tweet_links[i]) - -# parse = parse_links(links) -# parsed_links = parse[0] -# tweet_links = parse[1] -# mimetype = parse[2] -# timestamp = parse[3] - -# def display_not_tweet(): -# original_link = delete_tweet_pathnames(clean_tweet(tweet_links[i])) - -# if status: -# original_link = delete_tweet_pathnames( -# f'https://twitter.com/{tweet_links[i]}') -# elif not '://' in tweet_links[i]: -# original_link = delete_tweet_pathnames(f'https://{tweet_links[i]}') - -# response_html = requests.get(original_link) - -# if mimetype[i] == 'text/html' or mimetype[i] == 'warc/revisit' or mimetype[ -# i] == 'unk': -# if ('.jpg' in tweet_links[i] or '.png' -# in tweet_links[i]) and response_html.status_code == 200: -# components.iframe(tweet_links[i], height=500, scrolling=True) -# elif '/status/' not in original_link: -# st.info("This isn't a status or is not available") -# elif status or f'{st.session_state.current_handle}' not in original_link: -# st.info(f'Replying to {st.session_state.current_handle}') -# else: -# components.iframe(clean_link(link), height=500, scrolling=True) diff --git a/waybacktweets/request_tweets.py b/waybacktweets/request_tweets.py index 72f9f0e..7169948 100644 --- a/waybacktweets/request_tweets.py +++ b/waybacktweets/request_tweets.py @@ -1,3 +1,7 @@ +""" +Requests data from the Wayback Machine API. +""" + import requests @@ -5,7 +9,7 @@ def get_archived_tweets(username, unique=False, timestamp_from='', timestamp_to=''): - + """Requests data from the Wayback CDX Server API and returns it in JSON format.""" unique = f'&collapse=urlkey' if unique else '' if timestamp_from: diff --git a/waybacktweets/utils.py b/waybacktweets/utils.py index c9ca771..837e272 100644 --- a/waybacktweets/utils.py +++ b/waybacktweets/utils.py @@ -1,7 +1,13 @@ +""" +Helper functions. +""" + import re -def clean_tweet(tweet, username): +def clean_tweet_url(tweet, username): + """Converts the tweet to lowercase, checks if it contains a tweet URL associated with the username. + Returns the original tweet URL with correct casing; or returns the original tweet.""" tweet_lower = tweet.lower() pattern = re.compile(r'/status/(\d+)') @@ -16,6 +22,8 @@ def clean_tweet(tweet, username): def clean_wayback_machine_url(wayback_machine_url, archived_timestamp, username): + """Converts the Wayback Machine URL to lowercase, checks if it contains a tweet URL associated with the username. + Returns the original tweet URL with correct casing and archived timestamp; otherwise, it returns the original Wayback Machine URL.""" wayback_machine_url = wayback_machine_url.lower() pattern = re.compile(r'/status/(\d+)') @@ -28,10 +36,11 @@ def clean_wayback_machine_url(wayback_machine_url, archived_timestamp, def pattern_tweet(tweet): - # Reply: /status// - # Link: /status/// - # Twimg: /status/https://pbs + """Extracts tweet IDs from various types of tweet URLs or tweet-related patterns. + Reply pattern: /status// + Link pattern: /status/// + Twimg pattern: /status/https://pbs""" pattern = re.compile(r'/status/"([^"]+)"') match = pattern.search(tweet) @@ -42,8 +51,7 @@ def pattern_tweet(tweet): def delete_tweet_pathnames(tweet): - # Delete pathnames (/photos, /likes, /retweet...) - + """Removes any pathnames (/photos, /likes, /retweet...) from the tweet URL.""" pattern_username = re.compile(r'https://twitter\.com/([^/]+)/status/\d+') match_username = pattern_username.match(tweet) @@ -59,6 +67,8 @@ def delete_tweet_pathnames(tweet): def check_double_status(wayback_machine_url, original_tweet): + """Checks if a Wayback Machine URL contains two occurrences of "/status/" and if the original tweet does not contain "twitter.com". + Returns a boolean.""" if wayback_machine_url.count( '/status/') == 2 and not 'twitter.com' in original_tweet: return True @@ -67,4 +77,5 @@ def check_double_status(wayback_machine_url, original_tweet): def semicolon_parse(string): + """Replaces semicolons in a string with %3B.""" return ''.join('%3B' if c == ';' else c for c in string) diff --git a/waybacktweets/viz_tweets.py b/waybacktweets/viz_tweets.py index ac47297..a3c88d5 100644 --- a/waybacktweets/viz_tweets.py +++ b/waybacktweets/viz_tweets.py @@ -1,12 +1,18 @@ +""" +Generates an HTML file to visualize the parsed data. +""" + import json def read_json(json_file_path): + """Reads and loads JSON data from a specified file path.""" with open(json_file_path, 'r', encoding='utf-8') as f: return json.load(f) def generate_html(json_content, username): + """Generates an HTML file.""" html = f'\n\n@{username} archived tweets\n' html += '