+"""
+Exports the parsed archived tweets.
+"""
+
import pandas as pd
import re
import datetime
def datetime_now():
+ """Formats datetime."""
    now = datetime.datetime.now()
    formatted_now = now.strftime("%Y%m%d%H%M%S")

    return formatted_now
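
# Illustrative output (the value depends on the clock; the format is fixed):
# >>> datetime_now()
# '20240515103000'
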
def transpose_matrix(data, fill_value=None):
+ """Transposes a matrix, filling in missing values with a specified fill value if needed."""
max_length = max(len(sublist) for sublist in data)
    filled_data = [
        sublist + [fill_value] * (max_length - len(sublist))
        for sublist in data
    ]
    data_transposed = [list(row) for row in zip(*filled_data)]

    return data_transposed
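
# A quick sanity check of the padding-and-transpose behavior (illustrative):
# >>> transpose_matrix([[1, 2, 3], [4]])
# [[1, 4], [2, None], [3, None]]
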
-def response_tweets(data, username):
+def save_tweets(data, username):
+ """Saves parsed archived tweets in CSV, JSON, and HTML formats."""
    data_transposed = transpose_matrix(data)
    formatted_datetime = datetime_now()

    # filename scheme assumed; the original naming is not shown here
    filename = f'{username}_tweets_{formatted_datetime}'
    df = pd.DataFrame(data_transposed)

    csv_file_path = f'{filename}.csv'
    df.to_csv(csv_file_path, index=False)

    json_file_path = f'{filename}.json'
    df.to_json(json_file_path, orient='records', lines=False)

    html_file_path = f'{filename}.html'
-
json_content = read_json(json_file_path)
html_content = generate_html(json_content, username)
save_html(html_file_path, html_content)
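
# Example invocation, assuming `data` is the tuple of parallel lists returned
# by parse_archived_tweets (handle illustrative):
# save_tweets(data, 'jack')
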
+"""
+Main function for retrieving archived tweets.
+"""
+
from request_tweets import *
from parse_tweets import *
from export_tweets import *
def main():
+ """Invokes the functions to retrieve archived tweets, perform necessary parsing, and save the data."""
try:
archived_tweets = get_archived_tweets(username, unique, datetime_from,
datetime_to)
if archived_tweets:
data = parse_archived_tweets(archived_tweets, username)
- response_tweets(data, username)
+ save_tweets(data, username)
    except Exception as e:
        # generic handler assumed; the original except clause is not shown
        print(e)

    print(
        '\nNeed help? Open an issue: https://github.com/claromes/waybacktweets/issues.'
    )
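
# Standard entry-point guard (assumed; not shown in this excerpt):
if __name__ == '__main__':
    main()
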
+"""
+Parses the returned data from the Wayback CDX Server API.
+"""
+
import requests
import re
from urllib.parse import unquote
def embed(tweet):
+ """Parses the archived tweets when the tweets are still available using the Twitter Publish service from X.
+ Returns the text of the tweet, if it's a retweet, and the username of the account."""
try:
url = f'https://publish.twitter.com/oembed?url={tweet}'
response = requests.get(url)
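
        # A sketch of the remaining parsing, assuming the standard oEmbed
        # response fields (html, author_url) served by publish.twitter.com;
        # the project's exact extraction may differ:
        response_json = response.json()
        match = re.search(r'<p[^>]*>(.*?)</p>', response_json['html'],
                          re.DOTALL)
        tweet_content = match.group(1) if match else ''
        is_RT = 'RT @' in tweet_content
        tweet_username = response_json['author_url'].split('/')[-1]

        return tweet_content, is_RT, tweet_username
    except requests.exceptions.RequestException:
        # handler assumed; the original except clause is not shown
        return '', False, ''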
def parse_json_mimetype(tweet):
+ """Parses the archived tweets when the mimetype is application/json and returns the text of the tweet."""
response_json = requests.get(tweet)
if not (400 <= response_json.status_code <= 511):
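        # A sketch of the extraction, assuming the archived snapshot is the
        # tweet's JSON payload with its text under a 'text' key (key name
        # assumed):
        json_data = response_json.json()
        return json_data.get('text', '')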
def parse_archived_tweets(archived_tweets_response, username):
+ """Parses the archived tweets metadata and structures it in a more readable format."""
archived_urlkey = []
archived_timestamp = []
tweet = []
wayback_machine_url = f'https://web.archive.org/web/{response[1]}/{tweet_remove_char}'
original_tweet = delete_tweet_pathnames(
- clean_tweet(cleaned_tweet, username))
+ clean_tweet_url(cleaned_tweet, username))
parsed_wayback_machine_url = f'https://web.archive.org/web/{response[1]}/{original_tweet}'
archived_length.append(response[6])
    return (archived_urlkey, archived_timestamp, tweet, archived_tweet,
            parsed_tweet, parsed_tweet_mimetype_json, parsed_archived_tweet,
            archived_mimetype, archived_statuscode, archived_digest,
            archived_length, available_tweet_content, available_tweet_is_RT,
            available_tweet_username)
-
-
-# if tweet_links[i]:
-# link = parsed_links[i]
-# tweet = embed(tweet_links[i])
-
-# parse = parse_links(links)
-# parsed_links = parse[0]
-# tweet_links = parse[1]
-# mimetype = parse[2]
-# timestamp = parse[3]
-
-# def display_not_tweet():
-# original_link = delete_tweet_pathnames(clean_tweet(tweet_links[i]))
-
-# if status:
-# original_link = delete_tweet_pathnames(
-# f'https://twitter.com/{tweet_links[i]}')
-# elif not '://' in tweet_links[i]:
-# original_link = delete_tweet_pathnames(f'https://{tweet_links[i]}')
-
-# response_html = requests.get(original_link)
-
-# if mimetype[i] == 'text/html' or mimetype[i] == 'warc/revisit' or mimetype[
-# i] == 'unk':
-# if ('.jpg' in tweet_links[i] or '.png'
-# in tweet_links[i]) and response_html.status_code == 200:
-# components.iframe(tweet_links[i], height=500, scrolling=True)
-# elif '/status/' not in original_link:
-# st.info("This isn't a status or is not available")
-# elif status or f'{st.session_state.current_handle}' not in original_link:
-# st.info(f'Replying to {st.session_state.current_handle}')
-# else:
-# components.iframe(clean_link(link), height=500, scrolling=True)
+"""
+Requests data from the Wayback Machine API.
+"""
+
import requests
def get_archived_tweets(username,
                        unique=False,
                        timestamp_from='',
                        timestamp_to=''):
-
+ """Requests data from the Wayback CDX Server API and returns it in JSON format."""
    unique = '&collapse=urlkey' if unique else ''
    timestamp_from = f'&from={timestamp_from}' if timestamp_from else ''
    timestamp_to = f'&to={timestamp_to}' if timestamp_to else ''

    # CDX Server API query; the exact URL filter pattern is assumed
    url = f'https://web.archive.org/cdx/search/cdx?url=https://twitter.com/{username}/status/*&output=json{unique}{timestamp_from}{timestamp_to}'
    response = requests.get(url)

    if not (400 <= response.status_code <= 511):
        return response.json()
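
# Usage sketch (handle and timestamp range illustrative):
# get_archived_tweets('jack', unique=True,
#                     timestamp_from='20150101', timestamp_to='20151231')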
+"""
+Helper functions.
+"""
+
import re
-def clean_tweet(tweet, username):
+def clean_tweet_url(tweet, username):
+ """Converts the tweet to lowercase, checks if it contains a tweet URL associated with the username.
+ Returns the original tweet URL with correct casing; or returns the original tweet."""
    tweet_lower = tweet.lower()
    pattern = re.compile(r'/status/(\d+)')
    match = pattern.search(tweet_lower)

    if match and username.lower() in tweet_lower:
        return f'https://twitter.com/{username}/status/{match.group(1)}'

    return tweet
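
# Illustrative behavior (ID hypothetical):
# >>> clean_tweet_url('https://TWITTER.com/jack/status/123', 'jack')
# 'https://twitter.com/jack/status/123'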
def clean_wayback_machine_url(wayback_machine_url, archived_timestamp,
username):
+ """Converts the Wayback Machine URL to lowercase, checks if it contains a tweet URL associated with the username.
+ Returns the original tweet URL with correct casing and archived timestamp; otherwise, it returns the original Wayback Machine URL."""
    wayback_machine_url = wayback_machine_url.lower()
    pattern = re.compile(r'/status/(\d+)')
    match = pattern.search(wayback_machine_url)

    if match and username.lower() in wayback_machine_url:
        return f'https://web.archive.org/web/{archived_timestamp}/https://twitter.com/{username}/status/{match.group(1)}'

    return wayback_machine_url
def pattern_tweet(tweet):
- # Reply: /status//
- # Link: /status///
- # Twimg: /status/https://pbs
+ """Extracts tweet IDs from various types of tweet URLs or tweet-related patterns.
+ Reply pattern: /status//
+ Link pattern: /status///
+ Twimg pattern: /status/https://pbs"""
pattern = re.compile(r'/status/"([^"]+)"')
match = pattern.search(tweet)
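
# Illustrative match on a quoted reply-style URL (ID hypothetical):
# >>> pattern_tweet('https://twitter.com/i/status/"/jack/status/123"')
# 'jack/status/123'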
def delete_tweet_pathnames(tweet):
- # Delete pathnames (/photos, /likes, /retweet...)
-
+ """Removes any pathnames (/photos, /likes, /retweet...) from the tweet URL."""
    pattern_username = re.compile(r'https://twitter\.com/([^/]+)/status/\d+')
    match_username = pattern_username.match(tweet)

    if match_username:
        return match_username.group(0)

    return tweet
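
# Illustrative (ID hypothetical):
# >>> delete_tweet_pathnames('https://twitter.com/jack/status/123/photo/1')
# 'https://twitter.com/jack/status/123'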
def check_double_status(wayback_machine_url, original_tweet):
+ """Checks if a Wayback Machine URL contains two occurrences of "/status/" and if the original tweet does not contain "twitter.com".
+ Returns a boolean."""
    if wayback_machine_url.count(
            '/status/') == 2 and 'twitter.com' not in original_tweet:
        return True

    return False
def semicolon_parse(string):
+ """Replaces semicolons in a string with %3B."""
return ''.join('%3B' if c == ';' else c for c in string)
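
# For example:
# >>> semicolon_parse('20120107;150237')
# '20120107%3B150237'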
+"""
+Generates an HTML file to visualize the parsed data.
+"""
+
import json
def read_json(json_file_path):
+ """Reads and loads JSON data from a specified file path."""
with open(json_file_path, 'r', encoding='utf-8') as f:
return json.load(f)
def generate_html(json_content, username):
+ """Generates an HTML file."""
html = f'<html>\n<head>\n<title>@{username} archived tweets</title>\n'
html += '<style>\n'
html += 'body { font-family: monospace; background-color: #f5f8fa; color: #1c1e21; margin: 0; padding: 20px; }\n'
def save_html(html_file_path, html_content):
+ """Saves the generated HTML."""
with open(html_file_path, 'w', encoding='utf-8') as f:
f.write(html_content)