+"""
+Exports the parsed archived tweets.
+"""
+
import pandas as pd
import re
import datetime
def datetime_now():
+ """Formats datetime."""
    now = datetime.datetime.now()
    formatted_now = now.strftime("%Y%m%d%H%M%S")

    return formatted_now
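
# Illustrative output (the value depends on the clock; the format is fixed):
# >>> datetime_now()
# '20240515103000'
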
def transpose_matrix(data, fill_value=None):
+ """Transposes a matrix, filling in missing values with a specified fill value if needed."""
max_length = max(len(sublist) for sublist in data)
    filled_data = [
        sublist + [fill_value] * (max_length - len(sublist))
        for sublist in data
    ]
    data_transposed = [list(row) for row in zip(*filled_data)]

    return data_transposed
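
# A quick sanity check of the padding-and-transpose behavior (illustrative):
# >>> transpose_matrix([[1, 2, 3], [4]])
# [[1, 4], [2, None], [3, None]]
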
-def response_tweets(data, username):
+def save_tweets(data, username):
+ """Saves parsed archived tweets in CSV, JSON, and HTML formats."""
    data_transposed = transpose_matrix(data)
    formatted_datetime = datetime_now()

    # filename scheme assumed; the original naming is not shown here
    filename = f'{username}_tweets_{formatted_datetime}'
    df = pd.DataFrame(data_transposed)

    csv_file_path = f'{filename}.csv'
    df.to_csv(csv_file_path, index=False)

    json_file_path = f'{filename}.json'
    df.to_json(json_file_path, orient='records', lines=False)

    html_file_path = f'{filename}.html'
-
json_content = read_json(json_file_path)
html_content = generate_html(json_content, username)
save_html(html_file_path, html_content)
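
# Example invocation, assuming `data` is the tuple of parallel lists returned
# by parse_archived_tweets (handle illustrative):
# save_tweets(data, 'jack')
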
+"""
+Main function for retrieving archived tweets.
+"""
+
from request_tweets import *
from parse_tweets import *
from export_tweets import *
def main():
+ """Invokes the functions to retrieve archived tweets, perform necessary parsing, and save the data."""
try:
archived_tweets = get_archived_tweets(username, unique, datetime_from,
datetime_to)
if archived_tweets:
data = parse_archived_tweets(archived_tweets, username)
- response_tweets(data, username)
+ save_tweets(data, username)
    except Exception as e:
        # generic handler assumed; the original except clause is not shown
        print(e)

    print(
        '\nNeed help? Open an issue: https://github.com/claromes/waybacktweets/issues.'
    )
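
# Standard entry-point guard (assumed; not shown in this excerpt):
if __name__ == '__main__':
    main()
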
+"""
+Parses the returned data from the Wayback CDX Server API.
+"""
+
import requests
import re
from urllib.parse import unquote
def embed(tweet):
+ """Parses the archived tweets when the tweets are still available using the Twitter Publish service from X.
+ Returns the text of the tweet, if it's a retweet, and the username of the account."""
try:
url = f'https://publish.twitter.com/oembed?url={tweet}'
response = requests.get(url)
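
        # A sketch of the remaining parsing, assuming the standard oEmbed
        # response fields (html, author_url) served by publish.twitter.com;
        # the project's exact extraction may differ:
        response_json = response.json()
        match = re.search(r'<p[^>]*>(.*?)</p>', response_json['html'],
                          re.DOTALL)
        tweet_content = match.group(1) if match else ''
        is_RT = 'RT @' in tweet_content
        tweet_username = response_json['author_url'].split('/')[-1]

        return tweet_content, is_RT, tweet_username
    except requests.exceptions.RequestException:
        # handler assumed; the original except clause is not shown
        return '', False, ''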
def parse_json_mimetype(tweet):
+ """Parses the archived tweets when the mimetype is application/json and returns the text of the tweet."""
response_json = requests.get(tweet)
if not (400 <= response_json.status_code <= 511):
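        # A sketch of the extraction, assuming the archived snapshot is the
        # tweet's JSON payload with its text under a 'text' key (key name
        # assumed):
        json_data = response_json.json()
        return json_data.get('text', '')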
def parse_archived_tweets(archived_tweets_response, username):
+ """Parses the archived tweets metadata and structures it in a more readable format."""
archived_urlkey = []
archived_timestamp = []
tweet = []
wayback_machine_url = f'https://web.archive.org/web/{response[1]}/{tweet_remove_char}'
original_tweet = delete_tweet_pathnames(
- clean_tweet(cleaned_tweet, username))
+ clean_tweet_url(cleaned_tweet, username))
parsed_wayback_machine_url = f'https://web.archive.org/web/{response[1]}/{original_tweet}'
archived_length.append(response[6])
    return (archived_urlkey, archived_timestamp, tweet, archived_tweet,
            parsed_tweet, parsed_tweet_mimetype_json, parsed_archived_tweet,
            archived_mimetype, archived_statuscode, archived_digest,
            archived_length, available_tweet_content, available_tweet_is_RT,
            available_tweet_username)
-
-
-# if tweet_links[i]:
-# link = parsed_links[i]
-# tweet = embed(tweet_links[i])
-
-# parse = parse_links(links)
-# parsed_links = parse[0]
-# tweet_links = parse[1]
-# mimetype = parse[2]
-# timestamp = parse[3]
-
-# def display_not_tweet():
-# original_link = delete_tweet_pathnames(clean_tweet(tweet_links[i]))
-
-# if status:
-# original_link = delete_tweet_pathnames(
-# f'https://twitter.com/{tweet_links[i]}')
-# elif not '://' in tweet_links[i]:
-# original_link = delete_tweet_pathnames(f'https://{tweet_links[i]}')
-
-# response_html = requests.get(original_link)
-
-# if mimetype[i] == 'text/html' or mimetype[i] == 'warc/revisit' or mimetype[
-# i] == 'unk':
-# if ('.jpg' in tweet_links[i] or '.png'
-# in tweet_links[i]) and response_html.status_code == 200:
-# components.iframe(tweet_links[i], height=500, scrolling=True)
-# elif '/status/' not in original_link:
-# st.info("This isn't a status or is not available")
-# elif status or f'{st.session_state.current_handle}' not in original_link:
-# st.info(f'Replying to {st.session_state.current_handle}')
-# else:
-# components.iframe(clean_link(link), height=500, scrolling=True)
+"""
+Requests data from the Wayback Machine API.
+"""
+
import requests
def get_archived_tweets(username,
                        unique=False,
                        timestamp_from='',
                        timestamp_to=''):
-
+ """Requests data from the Wayback CDX Server API and returns it in JSON format."""
    unique = '&collapse=urlkey' if unique else ''
    timestamp_from = f'&from={timestamp_from}' if timestamp_from else ''
    timestamp_to = f'&to={timestamp_to}' if timestamp_to else ''

    # CDX Server API query; the exact URL filter pattern is assumed
    url = f'https://web.archive.org/cdx/search/cdx?url=https://twitter.com/{username}/status/*&output=json{unique}{timestamp_from}{timestamp_to}'
    response = requests.get(url)

    if not (400 <= response.status_code <= 511):
        return response.json()
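
# Usage sketch (handle and timestamp range illustrative):
# get_archived_tweets('jack', unique=True,
#                     timestamp_from='20150101', timestamp_to='20151231')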
+"""
+Helper functions.
+"""
+
import re
-def clean_tweet(tweet, username):
+def clean_tweet_url(tweet, username):
+ """Converts the tweet to lowercase, checks if it contains a tweet URL associated with the username.
+ Returns the original tweet URL with correct casing; or returns the original tweet."""
    tweet_lower = tweet.lower()
    pattern = re.compile(r'/status/(\d+)')
    match = pattern.search(tweet_lower)

    if match and username.lower() in tweet_lower:
        return f'https://twitter.com/{username}/status/{match.group(1)}'

    return tweet
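
# Illustrative behavior (ID hypothetical):
# >>> clean_tweet_url('https://TWITTER.com/jack/status/123', 'jack')
# 'https://twitter.com/jack/status/123'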
def clean_wayback_machine_url(wayback_machine_url, archived_timestamp,
username):
+ """Converts the Wayback Machine URL to lowercase, checks if it contains a tweet URL associated with the username.
+ Returns the original tweet URL with correct casing and archived timestamp; otherwise, it returns the original Wayback Machine URL."""
    wayback_machine_url = wayback_machine_url.lower()
    pattern = re.compile(r'/status/(\d+)')
    match = pattern.search(wayback_machine_url)

    if match and username.lower() in wayback_machine_url:
        return f'https://web.archive.org/web/{archived_timestamp}/https://twitter.com/{username}/status/{match.group(1)}'

    return wayback_machine_url
def pattern_tweet(tweet):
- # Reply: /status//
- # Link: /status///
- # Twimg: /status/https://pbs
+ """Extracts tweet IDs from various types of tweet URLs or tweet-related patterns.
+ Reply pattern: /status//
+ Link pattern: /status///
+ Twimg pattern: /status/https://pbs"""
pattern = re.compile(r'/status/"([^"]+)"')
match = pattern.search(tweet)
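
# Illustrative match on a quoted reply-style URL (ID hypothetical):
# >>> pattern_tweet('https://twitter.com/i/status/"/jack/status/123"')
# 'jack/status/123'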
def delete_tweet_pathnames(tweet):
- # Delete pathnames (/photos, /likes, /retweet...)
-
+ """Removes any pathnames (/photos, /likes, /retweet...) from the tweet URL."""
    pattern_username = re.compile(r'https://twitter\.com/([^/]+)/status/\d+')
    match_username = pattern_username.match(tweet)

    if match_username:
        return match_username.group(0)

    return tweet
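
# Illustrative (ID hypothetical):
# >>> delete_tweet_pathnames('https://twitter.com/jack/status/123/photo/1')
# 'https://twitter.com/jack/status/123'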
def check_double_status(wayback_machine_url, original_tweet):
+ """Checks if a Wayback Machine URL contains two occurrences of "/status/" and if the original tweet does not contain "twitter.com".
+ Returns a boolean."""
    if wayback_machine_url.count(
            '/status/') == 2 and 'twitter.com' not in original_tweet:
        return True

    return False
def semicolon_parse(string):
+ """Replaces semicolons in a string with %3B."""
return ''.join('%3B' if c == ';' else c for c in string)
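
# For example:
# >>> semicolon_parse('20120107;150237')
# '20120107%3B150237'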
+"""
+Generates an HTML file to visualize the parsed data.
+"""
+
import json
def read_json(json_file_path):
+ """Reads and loads JSON data from a specified file path."""
with open(json_file_path, 'r', encoding='utf-8') as f:
return json.load(f)
def generate_html(json_content, username):
+ """Generates an HTML file."""
html = f'<html>\n<head>\n<title>@{username} archived tweets</title>\n'
html += '<style>\n'
html += 'body { font-family: monospace; background-color: #f5f8fa; color: #1c1e21; margin: 0; padding: 20px; }\n'
def save_html(html_file_path, html_content):
+ """Saves the generated HTML."""
with open(html_file_path, 'w', encoding='utf-8') as f:
f.write(html_content)