.. autofunction:: check_double_status
.. autofunction:: check_pattern_tweet
+.. autofunction:: check_url_scheme
.. autofunction:: clean_tweet_url
.. autofunction:: clean_wayback_machine_url
.. autofunction:: delete_tweet_pathnames
.. autofunction:: get_response
.. autofunction:: is_tweet_url
.. autofunction:: semicolon_parser
+.. autofunction:: timestamp_parser
Exceptions
------------
These are the prerequisites:
-- Python 3.11+
+- Python 3.10+
- Poetry
Install from the source, following the :ref:`installation` instructions.
- ``archived_urlkey``: (`str`) A canonical transformation of the URL you supplied, for example, ``org,eserver,tc)/``. Such keys are useful for indexing.
-- ``archived_timestamp``: (`datetime`) A 14 digit date-time representation in the ``YYYYMMDDhhmmss`` format.
+- ``archived_timestamp``: (`str`) A 14 digit date-time representation in the ``YYYYMMDDhhmmss`` format.
-- ``original_tweet_url``: (`str`) The original tweet URL.
+- ``parsed_archived_timestamp``: (`str`) The ``archived_timestamp`` in human-readable format.
-- ``archived_tweet_url``: (`str`) The original archived URL.
+- ``archived_tweet_url``: (`str`) The archived URL.
-- ``parsed_tweet_url``: (`str`) The original tweet URL after parsing. Old URLs were archived in a nested manner. The parsing applied here unnests these URLs, when necessary. Check the :ref:`utils`.
+- ``parsed_archived_tweet_url``: (`str`) The archived URL after parsing. It is not guaranteed that this option will be archived; it is just a facilitator, as the originally archived URL does not always exist, due to changes in URLs and web services of the social network Twitter. Check the :ref:`utils`.
-- ``parsed_archived_tweet_url``: (`str`) The original archived URL after parsing. It is not guaranteed that this option will be archived, it is just a facilitator, as the originally archived URL does not always exist, due to changes in URLs and web services of the social network Twitter. Check the :ref:`utils`.
+- ``original_tweet_url``: (`str`) The original tweet URL.
+
+- ``parsed_tweet_url``: (`str`) The original tweet URL after parsing. Old URLs were archived in a nested manner. The parsing applied here unnests these URLs, when necessary. Check the :ref:`utils`.
- ``available_tweet_text``: (`str`) The tweet text extracted from the URL that is still available on the Twitter account.
packaging = "*"
pandas = ">=0.25"
toolz = "*"
+typing-extensions = {version = ">=4.0.1", markers = "python_version < \"3.11\""}
[package.extras]
all = ["altair-tiles (>=0.3.0)", "anywidget (>=0.9.0)", "pyarrow (>=11)", "vega-datasets (>=0.9.0)", "vegafusion[embed] (>=1.6.6)", "vl-convert-python (>=1.3.0)"]
packaging = ">=22.0"
pathspec = ">=0.9.0"
platformdirs = ">=2"
+tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""}
+typing-extensions = {version = ">=4.0.1", markers = "python_version < \"3.11\""}
[package.extras]
colorama = ["colorama (>=0.4.3)"]
[[package]]
name = "filelock"
-version = "3.15.1"
+version = "3.15.3"
description = "A platform independent file lock."
optional = false
python-versions = ">=3.8"
files = [
- {file = "filelock-3.15.1-py3-none-any.whl", hash = "sha256:71b3102950e91dfc1bb4209b64be4dc8854f40e5f534428d8684f953ac847fac"},
- {file = "filelock-3.15.1.tar.gz", hash = "sha256:58a2549afdf9e02e10720eaa4d4470f56386d7a6f72edd7d0596337af8ed7ad8"},
+ {file = "filelock-3.15.3-py3-none-any.whl", hash = "sha256:0151273e5b5d6cf753a61ec83b3a9b7d8821c39ae9af9d7ecf2f9e2f17404103"},
+ {file = "filelock-3.15.3.tar.gz", hash = "sha256:e1199bf5194a2277273dacd50269f0d87d0682088a3c561c15674ea9005d8635"},
]
[package.extras]
docs = ["furo (>=2023.9.10)", "sphinx (>=7.2.6)", "sphinx-autodoc-typehints (>=1.25.2)"]
-testing = ["covdefaults (>=2.3)", "coverage (>=7.3.2)", "diff-cover (>=8.0.1)", "pytest (>=7.4.3)", "pytest-asyncio (>=0.21)", "pytest-cov (>=4.1)", "pytest-mock (>=3.12)", "pytest-timeout (>=2.2)"]
+testing = ["covdefaults (>=2.3)", "coverage (>=7.3.2)", "diff-cover (>=8.0.1)", "pytest (>=7.4.3)", "pytest-asyncio (>=0.21)", "pytest-cov (>=4.1)", "pytest-mock (>=3.12)", "pytest-timeout (>=2.2)", "virtualenv (>=20.26.2)"]
typing = ["typing-extensions (>=4.8)"]
[[package]]
[package.dependencies]
Flake8 = ">=5"
+TOMLi = {version = "*", markers = "python_version < \"3.11\""}
[package.extras]
dev = ["pyTest", "pyTest-cov"]
[package.dependencies]
numpy = [
+ {version = ">=1.22.4", markers = "python_version < \"3.11\""},
{version = ">=1.23.2", markers = "python_version == \"3.11\""},
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
]
sphinxcontrib-jsmath = "*"
sphinxcontrib-qthelp = "*"
sphinxcontrib-serializinghtml = ">=1.1.9"
+tomli = {version = ">=2", markers = "python_version < \"3.11\""}
[package.extras]
docs = ["sphinxcontrib-websupport"]
[[package]]
name = "sphinx-autodoc-typehints"
-version = "2.1.1"
+version = "2.2.0"
description = "Type hints (PEP 484) support for the Sphinx autodoc extension"
optional = false
python-versions = ">=3.9"
files = [
- {file = "sphinx_autodoc_typehints-2.1.1-py3-none-any.whl", hash = "sha256:22427d74786274add2b6d4afccb8b3c8c1843f48a704550f15a35fd948f8a4de"},
- {file = "sphinx_autodoc_typehints-2.1.1.tar.gz", hash = "sha256:0072b65f5ab2818c229d6d6c2cc993770af55d36bb7bfb16001e2fce4d14880c"},
+ {file = "sphinx_autodoc_typehints-2.2.0-py3-none-any.whl", hash = "sha256:143e22dbb096cc39f1559d3accbe423e5fbf04d02849d6564e6471b5616bbd97"},
+ {file = "sphinx_autodoc_typehints-2.2.0.tar.gz", hash = "sha256:a21f0120d8657545ad5ec269d7276b0718c367c8ff2fa8ad8767ddf2c660b909"},
]
[package.dependencies]
{file = "toml-0.10.2.tar.gz", hash = "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"},
]
+[[package]]
+name = "tomli"
+version = "2.0.1"
+description = "A lil' TOML parser"
+optional = false
+python-versions = ">=3.7"
+files = [
+ {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"},
+ {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"},
+]
+
[[package]]
name = "toolz"
version = "0.12.1"
[metadata]
lock-version = "2.0"
-python-versions = "^3.11"
-content-hash = "37fcbc9255674bf67e65a2db35dbd71355fc97751141e739f31bb50fe708aa04"
+python-versions = "^3.10"
+content-hash = "4b34e093fd7034c803ee2d6b2a5598666e343c5b2562c2a2244d1528214bcacd"
[tool.poetry]
name = "waybacktweets"
-version = "1.0a1"
+version = "1.0a2"
description = "Retrieves archived tweets CDX data from the Wayback Machine, performs necessary parsing, and saves the data."
authors = ["Claromes <support@claromes.com>"]
license = "GPLv3"
readme = "README.md"
-repository = "https://github.com/claromes/waybacktweets"
-documentation = "https://claromes.github.io/waybacktweets/"
keywords = [
"twitter",
"tweet",
"Intended Audience :: Science/Research",
"License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
"Natural Language :: English",
+ "Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Topic :: Software Development",
"Topic :: Utilities",
exclude = ["app/**", "assets/**", "docs/**", ".streamlit/**"]
[tool.poetry.urls]
+"Homepage" = "https://claromes.github.io/waybacktweets/"
"Documentation" = "https://claromes.github.io/waybacktweets/"
"Issue Tracker" = "https://github.com/claromes/waybacktweets/issues"
[tool.poetry.dependencies]
-python = "^3.11"
+python = "^3.10"
requests = "^2.30.0"
-streamlit = "1.35.0"
rich = "^13.6.0"
click = "^8.1.7"
sphinx-autodoc-typehints = "^2.1.1"
[tool.poetry.group.dev.dependencies]
+streamlit = "1.35.0"
black = "^24.4.2"
flake8 = "^7.0.0"
isort = "^5.13.2"
field_options = [
"archived_urlkey",
"archived_timestamp",
- "original_tweet_url",
+ "parsed_archived_timestamp",
"archived_tweet_url",
- "parsed_tweet_url",
"parsed_archived_tweet_url",
+ "original_tweet_url",
+ "parsed_tweet_url",
"available_tweet_text",
"available_tweet_is_RT",
"available_tweet_info",
from waybacktweets.utils.utils import (
check_double_status,
check_pattern_tweet,
+ check_url_scheme,
clean_tweet_url,
delete_tweet_pathnames,
get_response,
is_tweet_url,
semicolon_parser,
+ timestamp_parser,
)
original_tweet = delete_tweet_pathnames(
clean_tweet_url(cleaned_tweet, self.username)
)
- parsed_wayback_machine_url = (
- f"https://web.archive.org/web/{response[1]}/{original_tweet}"
- )
double_status = check_double_status(wayback_machine_url, original_tweet)
if double_status:
original_tweet = delete_tweet_pathnames(
- f"https://twitter.com/{original_tweet}"
+ f"https://twitter.com{original_tweet}"
)
elif "://" not in original_tweet:
original_tweet = delete_tweet_pathnames(f"https://{original_tweet}")
- encoded_tweet = semicolon_parser(response[2])
- encoded_archived_tweet = semicolon_parser(wayback_machine_url)
- encoded_parsed_tweet = semicolon_parser(original_tweet)
- encoded_parsed_archived_tweet = semicolon_parser(parsed_wayback_machine_url)
+ parsed_wayback_machine_url = (
+ f"https://web.archive.org/web/{response[1]}/{original_tweet}"
+ )
+
+ encoded_archived_tweet = check_url_scheme(semicolon_parser(wayback_machine_url))
+ encoded_parsed_archived_tweet = check_url_scheme(
+ semicolon_parser(parsed_wayback_machine_url)
+ )
+ encoded_tweet = check_url_scheme(semicolon_parser(response[2]))
+ encoded_parsed_tweet = check_url_scheme(semicolon_parser(original_tweet))
available_tweet_text = None
available_tweet_is_RT = None
self._add_field("archived_urlkey", response[0])
self._add_field("archived_timestamp", response[1])
- self._add_field("original_tweet_url", encoded_tweet)
+ self._add_field("parsed_archived_timestamp", timestamp_parser(response[1]))
self._add_field("archived_tweet_url", encoded_archived_tweet)
- self._add_field("parsed_tweet_url", encoded_parsed_tweet)
self._add_field("parsed_archived_tweet_url", encoded_parsed_archived_tweet)
+ self._add_field("original_tweet_url", encoded_tweet)
+ self._add_field("parsed_tweet_url", encoded_parsed_tweet)
self._add_field("archived_mimetype", response[3])
self._add_field("archived_statuscode", response[4])
self._add_field("archived_digest", response[5])
import json
from typing import Any, Dict, List
+from waybacktweets.utils import timestamp_parser
+
class HTMLTweetsVisualizer:
"""
The generated HTML string.
"""
- html = f"<html>\n<head>\n<title>@{self.username} archived tweets</title>\n"
+ html = f"<html>\n<!-- This content was generated by Wayback Tweets. Visit: https://claromes.github.io/waybacktweets -->\n"
+ html += f"\n<head>\n<title>@{self.username}'s archived tweets</title>\n"
html += "<style>\n"
html += "body { font-family: monospace; background-color: #f5f8fa; color: #1c1e21; margin: 0; padding: 20px; }\n"
html += ".container { display: flex; flex-wrap: wrap; gap: 20px; }\n"
- html += ".tweet { flex: 0 1 calc(33.33% - 20px); background-color: #fff; border: 1px solid #e1e8ed; border-radius: 10px; padding: 15px; overflow-wrap: break-word; margin: auto; }\n"
+ html += ".tweet { flex: 0 1 calc(33.33% - 20px); background-color: #ffffff; border: 1px solid #e1e8ed; border-radius: 10px; padding: 15px; overflow-wrap: break-word; margin: auto; width: 600px; }\n"
html += ".tweet strong { font-weight: bold; }\n"
- html += ".tweet a { color: #ef5552; text-decoration: none; }\n"
- html += ".content { color: #ef5552; }\n"
+ html += ".tweet a { color: #000000; text-decoration: none; }\n"
+ html += ".content { color: #000000; }\n"
+ html += ".source { font-size: 12px; text-align: center; }\n"
+ html += ".iframe_text { font-size: 12px; text-align: end; }\n"
html += ".tweet a:hover { text-decoration: underline; }\n"
html += "h1, h3 { text-align: center; }\n"
html += "iframe { width: 600px; height: 600px; }\n"
+ html += "input {\n"
+ html += "position: absolute;\n"
+ html += "opacity: 0;\n"
+ html += "z-index: -1;\n"
+ html += "}\n"
+ html += ".accordion {\n"
+ html += "margin: 10px;\n"
+ html += "border-radius: 5px;\n"
+ html += "overflow: hidden;\n"
+ html += "box-shadow: 0 4px 4px -2px rgba(0, 0, 0, 0.4);\n"
+ html += "}\n"
+ html += ".accordion-label {\n"
+ html += "display: flex;\n"
+ html += "justify-content: space-between;\n"
+ html += "padding: 1em;\n"
+ html += "font-weight: bold;\n"
+ html += "cursor: pointer;\n"
+ html += "background: #000000;\n"
+ html += "color: #ffffff;\n"
+ html += "}\n"
+ html += ".accordion-content {\n"
+ html += "max-height: 0;\n"
+ html += "padding: 0 1em;\n"
+ html += "background: white;\n"
+ html += "transition: all 0.35s;\n"
+ html += "}\n"
+ html += "input:checked ~ .accordion-content {\n"
+ html += "max-height: 100vh;\n"
+ html += " padding: 1em;\n"
+ html += "}\n"
html += "</style>\n"
html += "</head>\n<body>\n"
- html += f"<h1>@{self.username} archived tweets</h1>\n"
+ html += f"<h1>@{self.username}'s archived tweets</h1>\n"
html += '<div class="container">\n'
- for tweet in self.json_file_path:
+ for index, tweet in enumerate(self.json_file_path):
html += '<div class="tweet">\n'
if (
tweet["archived_mimetype"] != "application/json"
and not tweet["available_tweet_text"]
):
- html += f'<iframe src="{tweet["parsed_archived_tweet_url"]}" frameborder="0" scrolling="auto"></iframe>\n'
+ iframe_src = {
+ "Archived Tweet": tweet["archived_tweet_url"],
+ "Parsed Archived Tweet": tweet["parsed_archived_tweet_url"],
+ "Original Tweet": tweet["original_tweet_url"],
+ "Parsed Tweet": tweet["parsed_tweet_url"],
+ }
+
+ for key, value in iframe_src.items():
+ key_cleaned = key.replace(" ", "_")
+
+ html += f'<p class="iframe_text"><a href="{value}" target="_blank"><strong>{key}↗</strong></a>\n'
+ html += '<div class="accordion">\n'
+ html += (
+ f'<input type="checkbox" id="tab_{index}_{key_cleaned}" />\n'
+ )
+ html += f'<label class="accordion-label" for="tab_{index}_{key_cleaned}">Click to load the iframe from {key}</label>\n'
+ html += '<div class="accordion-content">\n'
+
+ html += f'<div id="loading_{index}_{key_cleaned}" class="loading">Loading...</div>\n'
+ html += f'<iframe id="iframe_{index}_{key_cleaned}" frameborder="0" scrolling="auto" loading="lazy" style="display: none;" onload="document.getElementById(\'loading_{index}_{key_cleaned}\').style.display=\'none\'; this.style.display=\'block\';"></iframe>\n'
+ html += "</div>\n"
+ html += "</div>\n"
+
+ html += """
+ <script>
+ document.getElementById('tab_{index}_{key_cleaned}').addEventListener('change', function() {{
+ if (this.checked) {{
+ document.getElementById('loading_{index}_{key_cleaned}').style.display = 'block';
+ document.getElementById('iframe_{index}_{key_cleaned}').src = '{url}';
+ }}
+ }});
+ </script>
+ """.format(
+ index=index, url=value, key_cleaned=key_cleaned
+ )
- html += f'<p><a href="{tweet["original_tweet_url"]}" target="_blank"><strong>Original Tweet↗</strong></a> · \n'
- html += f'<a href="{tweet["parsed_tweet_url"]}" target="_blank"><strong>Parsed Tweet↗</strong></a> · \n'
- html += f'<a href="{tweet["archived_tweet_url"]}" target="_blank"><strong>Archived Tweet↗</strong></a> · \n'
- html += f'<a href="{tweet["parsed_archived_tweet_url"]}" target="_blank"><strong>Parsed Archived Tweet↗</strong></a></p>\n'
+ html += "<br>\n"
+ html += f'<p class="source">{tweet["original_tweet_url"]}</p>\n'
if tweet["available_tweet_text"]:
html += "<br>\n"
html += "<br>\n"
html += f'<p><strong>Archived URL Key:</strong> {tweet["archived_urlkey"]}</p>\n'
- html += f'<p><strong>Archived Timestamp:</strong> {tweet["archived_timestamp"]}</p>\n'
- html += f'<p><strong>Archived mimetype:</strong> {tweet["archived_mimetype"]}</p>\n'
+ html += f'<p><strong>Archived Timestamp:</strong> {timestamp_parser(tweet["archived_timestamp"])} ({tweet["archived_timestamp"]})</p>\n'
+ html += f'<p><strong>Archived mimetype: {tweet["archived_mimetype"]}</strong></p>\n'
html += f'<p><strong>Archived Statuscode:</strong> {tweet["archived_statuscode"]}</p>\n'
html += (
f'<p><strong>Archived Digest:</strong> {tweet["archived_digest"]}</p>\n'
html += "</div>\n"
html += "</div>\n"
- html += '<h3>generated by <a href="https://github.com/claromes/waybacktweets" target="_blank">Wayback Tweets↗</a></h3>\n'
+ html += '<p class="source">generated by <a href="https://claromes.github.io/waybacktweets/" target="_blank">Wayback Tweets↗</a></p>\n'
html += "</body>\n</html>"
return html
FIELD_OPTIONS = [
"archived_urlkey",
"archived_timestamp",
- "original_tweet_url",
+ "parsed_archived_timestamp",
"archived_tweet_url",
- "parsed_tweet_url",
"parsed_archived_tweet_url",
+ "original_tweet_url",
+ "parsed_tweet_url",
"available_tweet_text",
"available_tweet_is_RT",
"available_tweet_info",
from waybacktweets.utils.utils import (
check_double_status,
check_pattern_tweet,
+ check_url_scheme,
clean_tweet_url,
clean_wayback_machine_url,
delete_tweet_pathnames,
get_response,
is_tweet_url,
semicolon_parser,
+ timestamp_parser,
)
Utility functions for handling HTTP requests and manipulating URLs.
"""
+import html
import re
+from datetime import datetime
from typing import Optional, Tuple
import requests
Returns:
Only the extracted URL from a tweet.
"""
- patterns = [
- re.compile(r'/status/"([^"]+)"'),
- re.compile(r'/status/"([^"]+)"'),
- re.compile(r'/status/%3B([^"]+)%3B'),
- ]
-
- for pattern in patterns:
- match = pattern.search(tweet_url)
- if match:
- return match.group(1).lstrip("/")
+ pattern = r'/status/((?:"(.*?)"|"(.*?)(?=&|$)|"%3B(.*?)(?=&|$)))'
+ match = re.search(pattern, tweet_url)
+
+ if match:
+ if match.group(2):
+ parsed_tweet_url = match.group(2)
+ elif match.group(3):
+ parsed_tweet_url = match.group(3)
+ elif match.group(4):
+ parsed_tweet_url = match.group(4)
else:
- return tweet_url
+ parsed_tweet_url = ""
+
+ parsed_tweet_url = html.unescape(parsed_tweet_url)
+
+ return parsed_tweet_url
+
+ return tweet_url
def delete_tweet_pathnames(tweet_url: str) -> str:
return True
return False
+
+
def timestamp_parser(timestamp):
    """
    Converts a Wayback Machine timestamp into a human-readable string.

    Args:
        timestamp (str): Timestamp digits, anywhere from ``YYYY`` up to
            the full 14-digit ``YYYYMMDDhhmmss`` form.

    Returns:
        The timestamp rendered as ``"%Y/%m/%d %H:%M:%S"``, or None if no
        known format matches the input.
    """
    # Ordered shortest-to-longest; strptime rejects any candidate that does
    # not consume the whole string, so exactly one format can succeed.
    known_formats = (
        "%Y",
        "%Y%m",
        "%Y%m%d",
        "%Y%m%d%H",
        "%Y%m%d%H%M",
        "%Y%m%d%H%M%S",
    )

    for candidate in known_formats:
        try:
            moment = datetime.strptime(timestamp, candidate)
        except ValueError:
            continue
        return moment.strftime("%Y/%m/%d %H:%M:%S")

    return None
+
+
def check_url_scheme(url):
    """
    Corrects the URL scheme if it contains more than two slashes following the scheme.

    Every occurrence of 'http:' or 'https:' followed by two or more slashes
    is rewritten with the scheme followed by exactly two slashes.

    Args:
        url (str): The URL to be corrected.

    Returns:
        The corrected URL.
    """
    # \1 restores whichever scheme matched; the run of slashes collapses to "//".
    return re.sub(r"(https?:)/{2,}", r"\1//", url)