v1.0a2 - update parser and viz, add field parsed_archived_timestamp, review poetry...
author Claromes <claromes@hey.com>
Fri, 21 Jun 2024 14:42:07 +0000 (11:42 -0300)
committer Claromes <claromes@hey.com>
Fri, 21 Jun 2024 14:42:07 +0000 (11:42 -0300)
docs/api.rst
docs/contribute.rst
docs/field_options.rst
poetry.lock
pyproject.toml
waybacktweets/_cli.py
waybacktweets/api/parse.py
waybacktweets/api/visualize.py
waybacktweets/config/field_options.py
waybacktweets/utils/__init__.py
waybacktweets/utils/utils.py

index d0eb6151505865bb32a4ca1f5fef9f2622c0efb9..b068e1078f7146bf7c7d404e95cf438adbb1e0b7 100644 (file)
@@ -55,12 +55,14 @@ Utils
 
 .. autofunction:: check_double_status
 .. autofunction:: check_pattern_tweet
+.. autofunction:: check_url_scheme
 .. autofunction:: clean_tweet_url
 .. autofunction:: clean_wayback_machine_url
 .. autofunction:: delete_tweet_pathnames
 .. autofunction:: get_response
 .. autofunction:: is_tweet_url
 .. autofunction:: semicolon_parser
+.. autofunction:: timestamp_parser
 
 Exceptions
 ------------
index 6bfb7cc6203575051cfa0ba1093602984923b89c..87dffee1fe805bfe4469c0f1cb8c4012b41d632a 100644 (file)
@@ -16,7 +16,7 @@ If you have Python skills, contribute to the `code <https://github.com/claromes/
 
 These are the prerequisites:
 
-- Python 3.11+
+- Python 3.10+
 - Poetry
 
 Install from the source, following the :ref:`installation` instructions.
index 358e7faff49c57134657681f3710be4fc884737a..3c4a0aec844c0f5d30f15bf1b683b10e69441c64 100644 (file)
@@ -7,15 +7,17 @@ The package performs several parses to facilitate the analysis of archived tweet
 
 - ``archived_urlkey``: (`str`) A canonical transformation of the URL you supplied, for example, ``org,eserver,tc)/``. Such keys are useful for indexing.
 
-- ``archived_timestamp``: (`datetime`) A 14 digit date-time representation in the ``YYYYMMDDhhmmss`` format.
+- ``archived_timestamp``: (`str`) A 14 digit date-time representation in the ``YYYYMMDDhhmmss`` format.
 
-- ``original_tweet_url``: (`str`) The original tweet URL.
+- ``parsed_archived_timestamp``: (`str`) The ``archived_timestamp`` in human-readable format.
 
-- ``archived_tweet_url``: (`str`) The original archived URL.
+- ``archived_tweet_url``: (`str`) The archived URL.
 
-- ``parsed_tweet_url``: (`str`) The original tweet URL after parsing. Old URLs were archived in a nested manner. The parsing applied here unnests these URLs, when necessary.  Check the :ref:`utils`.
+- ``parsed_archived_tweet_url``: (`str`) The archived URL after parsing. It is not guaranteed that this option will be archived; it is just a facilitator, as the originally archived URL does not always exist due to changes in URLs and web services of the social network Twitter. Check the :ref:`utils`.
 
-- ``parsed_archived_tweet_url``: (`str`) The original archived URL after parsing. It is not guaranteed that this option will be archived, it is just a facilitator, as the originally archived URL does not always exist, due to changes in URLs and web services of the social network Twitter. Check the :ref:`utils`.
+- ``original_tweet_url``: (`str`) The original tweet URL.
+
+- ``parsed_tweet_url``: (`str`) The original tweet URL after parsing. Old URLs were archived in a nested manner. The parsing applied here unnests these URLs, when necessary.  Check the :ref:`utils`.
 
 - ``available_tweet_text``: (`str`) The tweet text extracted from the URL that is still available on the Twitter account.
 
index 740b1532acc8c360c40976e490159712c046fa98..d0b3de95f1aafc3cc8b0c2deedc7f1056d8bcb3a 100644 (file)
@@ -29,6 +29,7 @@ numpy = "*"
 packaging = "*"
 pandas = ">=0.25"
 toolz = "*"
+typing-extensions = {version = ">=4.0.1", markers = "python_version < \"3.11\""}
 
 [package.extras]
 all = ["altair-tiles (>=0.3.0)", "anywidget (>=0.9.0)", "pyarrow (>=11)", "vega-datasets (>=0.9.0)", "vegafusion[embed] (>=1.6.6)", "vl-convert-python (>=1.3.0)"]
@@ -105,6 +106,8 @@ mypy-extensions = ">=0.4.3"
 packaging = ">=22.0"
 pathspec = ">=0.9.0"
 platformdirs = ">=2"
+tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""}
+typing-extensions = {version = ">=4.0.1", markers = "python_version < \"3.11\""}
 
 [package.extras]
 colorama = ["colorama (>=0.4.3)"]
@@ -304,18 +307,18 @@ files = [
 
 [[package]]
 name = "filelock"
-version = "3.15.1"
+version = "3.15.3"
 description = "A platform independent file lock."
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "filelock-3.15.1-py3-none-any.whl", hash = "sha256:71b3102950e91dfc1bb4209b64be4dc8854f40e5f534428d8684f953ac847fac"},
-    {file = "filelock-3.15.1.tar.gz", hash = "sha256:58a2549afdf9e02e10720eaa4d4470f56386d7a6f72edd7d0596337af8ed7ad8"},
+    {file = "filelock-3.15.3-py3-none-any.whl", hash = "sha256:0151273e5b5d6cf753a61ec83b3a9b7d8821c39ae9af9d7ecf2f9e2f17404103"},
+    {file = "filelock-3.15.3.tar.gz", hash = "sha256:e1199bf5194a2277273dacd50269f0d87d0682088a3c561c15674ea9005d8635"},
 ]
 
 [package.extras]
 docs = ["furo (>=2023.9.10)", "sphinx (>=7.2.6)", "sphinx-autodoc-typehints (>=1.25.2)"]
-testing = ["covdefaults (>=2.3)", "coverage (>=7.3.2)", "diff-cover (>=8.0.1)", "pytest (>=7.4.3)", "pytest-asyncio (>=0.21)", "pytest-cov (>=4.1)", "pytest-mock (>=3.12)", "pytest-timeout (>=2.2)"]
+testing = ["covdefaults (>=2.3)", "coverage (>=7.3.2)", "diff-cover (>=8.0.1)", "pytest (>=7.4.3)", "pytest-asyncio (>=0.21)", "pytest-cov (>=4.1)", "pytest-mock (>=3.12)", "pytest-timeout (>=2.2)", "virtualenv (>=20.26.2)"]
 typing = ["typing-extensions (>=4.8)"]
 
 [[package]]
@@ -346,6 +349,7 @@ files = [
 
 [package.dependencies]
 Flake8 = ">=5"
+TOMLi = {version = "*", markers = "python_version < \"3.11\""}
 
 [package.extras]
 dev = ["pyTest", "pyTest-cov"]
@@ -732,6 +736,7 @@ files = [
 
 [package.dependencies]
 numpy = [
+    {version = ">=1.22.4", markers = "python_version < \"3.11\""},
     {version = ">=1.23.2", markers = "python_version == \"3.11\""},
     {version = ">=1.26.0", markers = "python_version >= \"3.12\""},
 ]
@@ -1326,6 +1331,7 @@ sphinxcontrib-htmlhelp = ">=2.0.0"
 sphinxcontrib-jsmath = "*"
 sphinxcontrib-qthelp = "*"
 sphinxcontrib-serializinghtml = ">=1.1.9"
+tomli = {version = ">=2", markers = "python_version < \"3.11\""}
 
 [package.extras]
 docs = ["sphinxcontrib-websupport"]
@@ -1334,13 +1340,13 @@ test = ["cython (>=3.0)", "defusedxml (>=0.7.1)", "pytest (>=6.0)", "setuptools
 
 [[package]]
 name = "sphinx-autodoc-typehints"
-version = "2.1.1"
+version = "2.2.0"
 description = "Type hints (PEP 484) support for the Sphinx autodoc extension"
 optional = false
 python-versions = ">=3.9"
 files = [
-    {file = "sphinx_autodoc_typehints-2.1.1-py3-none-any.whl", hash = "sha256:22427d74786274add2b6d4afccb8b3c8c1843f48a704550f15a35fd948f8a4de"},
-    {file = "sphinx_autodoc_typehints-2.1.1.tar.gz", hash = "sha256:0072b65f5ab2818c229d6d6c2cc993770af55d36bb7bfb16001e2fce4d14880c"},
+    {file = "sphinx_autodoc_typehints-2.2.0-py3-none-any.whl", hash = "sha256:143e22dbb096cc39f1559d3accbe423e5fbf04d02849d6564e6471b5616bbd97"},
+    {file = "sphinx_autodoc_typehints-2.2.0.tar.gz", hash = "sha256:a21f0120d8657545ad5ec269d7276b0718c367c8ff2fa8ad8767ddf2c660b909"},
 ]
 
 [package.dependencies]
@@ -1570,6 +1576,17 @@ files = [
     {file = "toml-0.10.2.tar.gz", hash = "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"},
 ]
 
+[[package]]
+name = "tomli"
+version = "2.0.1"
+description = "A lil' TOML parser"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"},
+    {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"},
+]
+
 [[package]]
 name = "toolz"
 version = "0.12.1"
@@ -1706,5 +1723,5 @@ watchmedo = ["PyYAML (>=3.10)"]
 
 [metadata]
 lock-version = "2.0"
-python-versions = "^3.11"
-content-hash = "37fcbc9255674bf67e65a2db35dbd71355fc97751141e739f31bb50fe708aa04"
+python-versions = "^3.10"
+content-hash = "4b34e093fd7034c803ee2d6b2a5598666e343c5b2562c2a2244d1528214bcacd"
index dfc3e15350cb0bb44e9100dace4f78a22052fe35..2085a840d9fe1da89731b6aa0e17b5e81b57b908 100644 (file)
@@ -1,12 +1,10 @@
 [tool.poetry]
 name = "waybacktweets"
-version = "1.0a1"
+version = "1.0a2"
 description = "Retrieves archived tweets CDX data from the Wayback Machine, performs necessary parsing, and saves the data."
 authors = ["Claromes <support@claromes.com>"]
 license = "GPLv3"
 readme = "README.md"
-repository = "https://github.com/claromes/waybacktweets"
-documentation = "https://claromes.github.io/waybacktweets/"
 keywords = [
     "twitter",
     "tweet",
@@ -22,6 +20,7 @@ classifiers = [
     "Intended Audience :: Science/Research",
     "License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
     "Natural Language :: English",
+    "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
     "Topic :: Software Development",
     "Topic :: Utilities",
@@ -29,13 +28,13 @@ classifiers = [
 exclude = ["app/**", "assets/**", "docs/**", ".streamlit/**"]
 
 [tool.poetry.urls]
+"Homepage" = "https://claromes.github.io/waybacktweets/"
 "Documentation" = "https://claromes.github.io/waybacktweets/"
 "Issue Tracker" = "https://github.com/claromes/waybacktweets/issues"
 
 [tool.poetry.dependencies]
-python = "^3.11"
+python = "^3.10"
 requests = "^2.30.0"
-streamlit = "1.35.0"
 rich = "^13.6.0"
 click = "^8.1.7"
 
@@ -48,6 +47,7 @@ sphinx-click = "^6.0.0"
 sphinx-autodoc-typehints = "^2.1.1"
 
 [tool.poetry.group.dev.dependencies]
+streamlit = "1.35.0"
 black = "^24.4.2"
 flake8 = "^7.0.0"
 isort = "^5.13.2"
index d189c203fae3e4df657e81403f3d98615e937322..d115c09bd30ac8faef0d28d44b54efb9e1208d24 100644 (file)
@@ -128,10 +128,11 @@ def main(
             field_options = [
                 "archived_urlkey",
                 "archived_timestamp",
-                "original_tweet_url",
+                "parsed_archived_timestamp",
                 "archived_tweet_url",
-                "parsed_tweet_url",
                 "parsed_archived_tweet_url",
+                "original_tweet_url",
+                "parsed_tweet_url",
                 "available_tweet_text",
                 "available_tweet_is_RT",
                 "available_tweet_info",
index 19228f013ab8ea75a337b8748d37e575748fc1ee..c6d3510618e474219dc64d6ab5c87d1e5d4ac406 100644 (file)
@@ -21,11 +21,13 @@ from waybacktweets.exceptions.exceptions import (
 from waybacktweets.utils.utils import (
     check_double_status,
     check_pattern_tweet,
+    check_url_scheme,
     clean_tweet_url,
     delete_tweet_pathnames,
     get_response,
     is_tweet_url,
     semicolon_parser,
+    timestamp_parser,
 )
 
 
@@ -203,23 +205,26 @@ class TweetsParser:
         original_tweet = delete_tweet_pathnames(
             clean_tweet_url(cleaned_tweet, self.username)
         )
-        parsed_wayback_machine_url = (
-            f"https://web.archive.org/web/{response[1]}/{original_tweet}"
-        )
 
         double_status = check_double_status(wayback_machine_url, original_tweet)
 
         if double_status:
             original_tweet = delete_tweet_pathnames(
-                f"https://twitter.com/{original_tweet}"
+                f"https://twitter.com{original_tweet}"
             )
         elif "://" not in original_tweet:
             original_tweet = delete_tweet_pathnames(f"https://{original_tweet}")
 
-        encoded_tweet = semicolon_parser(response[2])
-        encoded_archived_tweet = semicolon_parser(wayback_machine_url)
-        encoded_parsed_tweet = semicolon_parser(original_tweet)
-        encoded_parsed_archived_tweet = semicolon_parser(parsed_wayback_machine_url)
+        parsed_wayback_machine_url = (
+            f"https://web.archive.org/web/{response[1]}/{original_tweet}"
+        )
+
+        encoded_archived_tweet = check_url_scheme(semicolon_parser(wayback_machine_url))
+        encoded_parsed_archived_tweet = check_url_scheme(
+            semicolon_parser(parsed_wayback_machine_url)
+        )
+        encoded_tweet = check_url_scheme(semicolon_parser(response[2]))
+        encoded_parsed_tweet = check_url_scheme(semicolon_parser(original_tweet))
 
         available_tweet_text = None
         available_tweet_is_RT = None
@@ -242,10 +247,11 @@ class TweetsParser:
 
         self._add_field("archived_urlkey", response[0])
         self._add_field("archived_timestamp", response[1])
-        self._add_field("original_tweet_url", encoded_tweet)
+        self._add_field("parsed_archived_timestamp", timestamp_parser(response[1]))
         self._add_field("archived_tweet_url", encoded_archived_tweet)
-        self._add_field("parsed_tweet_url", encoded_parsed_tweet)
         self._add_field("parsed_archived_tweet_url", encoded_parsed_archived_tweet)
+        self._add_field("original_tweet_url", encoded_tweet)
+        self._add_field("parsed_tweet_url", encoded_parsed_tweet)
         self._add_field("archived_mimetype", response[3])
         self._add_field("archived_statuscode", response[4])
         self._add_field("archived_digest", response[5])
index 70824e7e50e9f67f4f55bedff41cb773a5f97e3a..369e0f19b2d25709ee2fd8b2b54018bcbdedda39 100644 (file)
@@ -6,6 +6,8 @@ Generates an HTML file to visualize the parsed data.
 import json
 from typing import Any, Dict, List
 
+from waybacktweets.utils import timestamp_parser
+
 
 class HTMLTweetsVisualizer:
     """
@@ -44,35 +46,100 @@ class HTMLTweetsVisualizer:
             The generated HTML string.
         """
 
-        html = f"<html>\n<head>\n<title>@{self.username} archived tweets</title>\n"
+        html = f"<html>\n<!-- This content was generated by Wayback Tweets. Visit: https://claromes.github.io/waybacktweets -->\n"
+        html += f"\n<head>\n<title>@{self.username}'s archived tweets</title>\n"
         html += "<style>\n"
         html += "body { font-family: monospace; background-color: #f5f8fa; color: #1c1e21; margin: 0; padding: 20px; }\n"
         html += ".container { display: flex; flex-wrap: wrap; gap: 20px; }\n"
-        html += ".tweet { flex: 0 1 calc(33.33% - 20px); background-color: #fff; border: 1px solid #e1e8ed; border-radius: 10px; padding: 15px; overflow-wrap: break-word; margin: auto; }\n"
+        html += ".tweet { flex: 0 1 calc(33.33% - 20px); background-color: #ffffff; border: 1px solid #e1e8ed; border-radius: 10px; padding: 15px; overflow-wrap: break-word; margin: auto; width: 600px; }\n"
         html += ".tweet strong { font-weight: bold; }\n"
-        html += ".tweet a { color: #ef5552; text-decoration: none; }\n"
-        html += ".content { color: #ef5552; }\n"
+        html += ".tweet a { color: #000000; text-decoration: none; }\n"
+        html += ".content { color: #000000; }\n"
+        html += ".source { font-size: 12px; text-align: center; }\n"
+        html += ".iframe_text { font-size: 12px; text-align: end; }\n"
         html += ".tweet a:hover { text-decoration: underline; }\n"
         html += "h1, h3 { text-align: center; }\n"
         html += "iframe { width: 600px; height: 600px; }\n"
+        html += "input {\n"
+        html += "position: absolute;\n"
+        html += "opacity: 0;\n"
+        html += "z-index: -1;\n"
+        html += "}\n"
+        html += ".accordion {\n"
+        html += "margin: 10px;\n"
+        html += "border-radius: 5px;\n"
+        html += "overflow: hidden;\n"
+        html += "box-shadow: 0 4px 4px -2px rgba(0, 0, 0, 0.4);\n"
+        html += "}\n"
+        html += ".accordion-label {\n"
+        html += "display: flex;\n"
+        html += "justify-content: space-between;\n"
+        html += "padding: 1em;\n"
+        html += "font-weight: bold;\n"
+        html += "cursor: pointer;\n"
+        html += "background: #000000;\n"
+        html += "color: #ffffff;\n"
+        html += "}\n"
+        html += ".accordion-content {\n"
+        html += "max-height: 0;\n"
+        html += "padding: 0 1em;\n"
+        html += "background: white;\n"
+        html += "transition: all 0.35s;\n"
+        html += "}\n"
+        html += "input:checked ~ .accordion-content {\n"
+        html += "max-height: 100vh;\n"
+        html += " padding: 1em;\n"
+        html += "}\n"
         html += "</style>\n"
         html += "</head>\n<body>\n"
-        html += f"<h1>@{self.username} archived tweets</h1>\n"
+        html += f"<h1>@{self.username}'s archived tweets</h1>\n"
         html += '<div class="container">\n'
 
-        for tweet in self.json_file_path:
+        for index, tweet in enumerate(self.json_file_path):
             html += '<div class="tweet">\n'
 
             if (
                 tweet["archived_mimetype"] != "application/json"
                 and not tweet["available_tweet_text"]
             ):
-                html += f'<iframe src="{tweet["parsed_archived_tweet_url"]}" frameborder="0" scrolling="auto"></iframe>\n'
+                iframe_src = {
+                    "Archived Tweet": tweet["archived_tweet_url"],
+                    "Parsed Archived Tweet": tweet["parsed_archived_tweet_url"],
+                    "Original Tweet": tweet["original_tweet_url"],
+                    "Parsed Tweet": tweet["parsed_tweet_url"],
+                }
+
+                for key, value in iframe_src.items():
+                    key_cleaned = key.replace(" ", "_")
+
+                    html += f'<p class="iframe_text"><a href="{value}" target="_blank"><strong>{key}↗</strong></a>\n'
+                    html += '<div class="accordion">\n'
+                    html += (
+                        f'<input type="checkbox" id="tab_{index}_{key_cleaned}" />\n'
+                    )
+                    html += f'<label class="accordion-label" for="tab_{index}_{key_cleaned}">Click to load the iframe from {key}</label>\n'
+                    html += '<div class="accordion-content">\n'
+
+                    html += f'<div id="loading_{index}_{key_cleaned}" class="loading">Loading...</div>\n'
+                    html += f'<iframe id="iframe_{index}_{key_cleaned}" frameborder="0" scrolling="auto" loading="lazy" style="display: none;" onload="document.getElementById(\'loading_{index}_{key_cleaned}\').style.display=\'none\'; this.style.display=\'block\';"></iframe>\n'
+                    html += "</div>\n"
+                    html += "</div>\n"
+
+                    html += """
+                    <script>
+                    document.getElementById('tab_{index}_{key_cleaned}').addEventListener('change', function() {{
+                        if (this.checked) {{
+                            document.getElementById('loading_{index}_{key_cleaned}').style.display = 'block';
+                            document.getElementById('iframe_{index}_{key_cleaned}').src = '{url}';
+                        }}
+                    }});
+                    </script>
+                    """.format(
+                        index=index, url=value, key_cleaned=key_cleaned
+                    )
 
-            html += f'<p><a href="{tweet["original_tweet_url"]}" target="_blank"><strong>Original Tweet↗</strong></a> · \n'
-            html += f'<a href="{tweet["parsed_tweet_url"]}" target="_blank"><strong>Parsed Tweet↗</strong></a> · \n'
-            html += f'<a href="{tweet["archived_tweet_url"]}" target="_blank"><strong>Archived Tweet↗</strong></a> · \n'
-            html += f'<a href="{tweet["parsed_archived_tweet_url"]}" target="_blank"><strong>Parsed Archived Tweet↗</strong></a></p>\n'
+                html += "<br>\n"
+                html += f'<p class="source">{tweet["original_tweet_url"]}</p>\n'
 
             if tweet["available_tweet_text"]:
                 html += "<br>\n"
@@ -82,8 +149,8 @@ class HTMLTweetsVisualizer:
 
             html += "<br>\n"
             html += f'<p><strong>Archived URL Key:</strong> {tweet["archived_urlkey"]}</p>\n'
-            html += f'<p><strong>Archived Timestamp:</strong> {tweet["archived_timestamp"]}</p>\n'
-            html += f'<p><strong>Archived mimetype:</strong> {tweet["archived_mimetype"]}</p>\n'
+            html += f'<p><strong>Archived Timestamp:</strong> {timestamp_parser(tweet["archived_timestamp"])} ({tweet["archived_timestamp"]})</p>\n'
+            html += f'<p><strong>Archived mimetype: {tweet["archived_mimetype"]}</strong></p>\n'
             html += f'<p><strong>Archived Statuscode:</strong> {tweet["archived_statuscode"]}</p>\n'
             html += (
                 f'<p><strong>Archived Digest:</strong> {tweet["archived_digest"]}</p>\n'
@@ -94,7 +161,7 @@ class HTMLTweetsVisualizer:
             html += "</div>\n"
 
         html += "</div>\n"
-        html += '<h3>generated by <a href="https://github.com/claromes/waybacktweets" target="_blank">Wayback Tweets↗</a></h3>\n'
+        html += '<p class="source">generated by <a href="https://claromes.github.io/waybacktweets/" target="_blank">Wayback Tweets↗</a></p>\n'
         html += "</body>\n</html>"
 
         return html
index 9c1fcb6b0122e263cb8aad85711b754dcc333d83..1d36f031c6f85f9e7af901846e7e53cdadaaa193 100644 (file)
@@ -5,10 +5,11 @@ List of valid field options that can be used for parsing tweets.
 FIELD_OPTIONS = [
     "archived_urlkey",
     "archived_timestamp",
-    "original_tweet_url",
+    "parsed_archived_timestamp",
     "archived_tweet_url",
-    "parsed_tweet_url",
     "parsed_archived_tweet_url",
+    "original_tweet_url",
+    "parsed_tweet_url",
     "available_tweet_text",
     "available_tweet_is_RT",
     "available_tweet_info",
index 8a7685589280c2c61bb91c392c44e158932633f0..a6f3f7aecd97b35663dae81ad3905d978dcb2339 100644 (file)
@@ -3,10 +3,12 @@
 from waybacktweets.utils.utils import (
     check_double_status,
     check_pattern_tweet,
+    check_url_scheme,
     clean_tweet_url,
     clean_wayback_machine_url,
     delete_tweet_pathnames,
     get_response,
     is_tweet_url,
     semicolon_parser,
+    timestamp_parser,
 )
index 89be2b68301259492a916e79b377721ab13ca78f..52f6bc47b307afebdc074073aecf9a1d270c27fc 100644 (file)
@@ -2,7 +2,9 @@
 Utility functions for handling HTTP requests and manipulating URLs.
 """
 
+import html
 import re
+from datetime import datetime
 from typing import Optional, Tuple
 
 import requests
@@ -126,18 +128,24 @@ def check_pattern_tweet(tweet_url: str) -> str:
     Returns:
         Only the extracted URL from a tweet.
     """
-    patterns = [
-        re.compile(r'/status/"([^"]+)"'),
-        re.compile(r'/status/&quot;([^"]+)&quot;'),
-        re.compile(r'/status/%3B([^"]+)%3B'),
-    ]
-
-    for pattern in patterns:
-        match = pattern.search(tweet_url)
-        if match:
-            return match.group(1).lstrip("/")
+    pattern = r'/status/((?:"(.*?)"|&quot;(.*?)(?=&|$)|&quot%3B(.*?)(?=&|$)))'
+    match = re.search(pattern, tweet_url)
+
+    if match:
+        if match.group(2):
+            parsed_tweet_url = match.group(2)
+        elif match.group(3):
+            parsed_tweet_url = match.group(3)
+        elif match.group(4):
+            parsed_tweet_url = match.group(4)
         else:
-            return tweet_url
+            parsed_tweet_url = ""
+
+        parsed_tweet_url = html.unescape(parsed_tweet_url)
+
+        return parsed_tweet_url
+
+    return tweet_url
 
 
 def delete_tweet_pathnames(tweet_url: str) -> str:
@@ -213,3 +221,59 @@ def is_tweet_url(twitter_url: str) -> bool:
         return True
 
     return False
+
+
+def timestamp_parser(timestamp):
+    """
+    Parses a timestamp into a formatted string.
+
+    Args:
+        timestamp (str): The timestamp string to parse.
+
+    Returns:
+        The parsed timestamp in the format "%Y/%m/%d %H:%M:%S", or None if the
+        timestamp could not be parsed.
+    """
+    formats = [
+        "%Y",
+        "%Y%m",
+        "%Y%m%d",
+        "%Y%m%d%H",
+        "%Y%m%d%H%M",
+        "%Y%m%d%H%M%S",
+    ]
+
+    for fmt in formats:
+        try:
+            parsed_time = datetime.strptime(timestamp, fmt)
+
+            formatted_time = parsed_time.strftime("%Y/%m/%d %H:%M:%S")
+            return formatted_time
+        except ValueError:
+            continue
+
+    return None
+
+
+def check_url_scheme(url):
+    """
+    Corrects the URL scheme if it contains more than two slashes following the scheme.
+
+    This function uses a regular expression to find 'http:' or 'https:' followed by two or more slashes.
+    It then replaces this with the scheme followed by exactly two slashes.
+
+    Args:
+        url (str): The URL to be corrected.
+
+    Returns:
+        The corrected URL.
+    """  # noqa: E501
+    pattern = r"(http:|https:)(/{2,})"
+
+    def replace_function(match):
+        scheme = match.group(1)
+        return f"{scheme}//"
+
+    parsed_url = re.sub(pattern, replace_function, url)
+
+    return parsed_url