From: Claromes Date: Sun, 16 Jun 2024 11:55:39 +0000 (-0300) Subject: update dirs X-Git-Url: https://git.claromes.com/?a=commitdiff_plain;h=4419b71916655f8d01c008cba840b0ea1909850b;p=waybacktweets.git update dirs --- diff --git a/LICENSE.md b/LICENSE.md index f240dca..624d376 100644 --- a/LICENSE.md +++ b/LICENSE.md @@ -631,7 +631,7 @@ to attach them to the start of each source file to most effectively state the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. - Wayback Tweets - Retrieves archived tweets' CDX data from the Wayback Machine, performs necessary parsing, and saves the data. + Wayback Tweets - Retrieves archived tweets CDX data from the Wayback Machine, performs necessary parsing, and saves the data. Copyright (C) 2023 Clarissa Mendes (Claromes) This program is free software: you can redistribute it and/or modify diff --git a/docs/cli.rst b/docs/cli.rst index f6f19fc..2a16040 100644 --- a/docs/cli.rst +++ b/docs/cli.rst @@ -4,7 +4,7 @@ CLI Usage --------- -.. click:: waybacktweets.cli.main:cli +.. click:: waybacktweets._cli:main :prog: waybacktweets :nested: full diff --git a/pyproject.toml b/pyproject.toml index 38b4706..abd71e5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,7 +35,7 @@ max-line-length = 88 extend-ignore = ["E203", "E701"] [tool.poetry.scripts] -waybacktweets = 'waybacktweets.cli.main:cli' +waybacktweets = 'waybacktweets._cli:main' [build-system] requires = ["poetry-core"] diff --git a/waybacktweets/_cli.py b/waybacktweets/_cli.py new file mode 100644 index 0000000..753b8f5 --- /dev/null +++ b/waybacktweets/_cli.py @@ -0,0 +1,135 @@ +""" +CLI functions for retrieving archived tweets. +""" + +from datetime import datetime +from typing import Any, Optional + +import click +from rich import print as rprint + +from waybacktweets.api.export_tweets import TweetsExporter +from waybacktweets.api.parse_tweets import TweetsParser +from waybacktweets.api.request_tweets import WaybackTweets + + +def parse_date( + ctx: Optional[Any] = None, param: Optional[Any] = None, value: Optional[str] = None +) -> Optional[str]: + """ + Parses a date string and returns it in the format "YYYYMMDD". + + :param ctx: Necessary when used with the click package. Defaults to None. + :param param: Necessary when used with the click package. Defaults to None. + :param value: A date string in the "YYYYMMDD" format. Defaults to None. + + :returns: The input date string formatted in the "YYYYMMDD" format, + or None if no date string was provided. + """ + try: + if value is None: + return None + + date = datetime.strptime(value, "%Y%m%d") + + return date.strftime("%Y%m%d") + except ValueError: + raise click.BadParameter("Date must be in format YYYYmmdd") + + +@click.command() +@click.argument("username", type=str) +@click.option( + "--collapse", + type=click.Choice(["urlkey", "digest", "timestamp:XX"], case_sensitive=False), + default=None, + help="Collapse results based on a field, or a substring of a field. XX in the timestamp value ranges from 1 to 14, comparing the first XX digits of the timestamp field. It is recommended to use from 4 onwards, to compare at least by years.", # noqa: E501 +) +@click.option( + "--from", + "timestamp_from", + type=click.UNPROCESSED, + metavar="DATE", + callback=parse_date, + default=None, + help="Filtering by date range from this date. Format: YYYYmmdd", +) +@click.option( + "--to", + "timestamp_to", + type=click.UNPROCESSED, + metavar="DATE", + callback=parse_date, + default=None, + help="Filtering by date range up to this date. Format: YYYYmmdd", +) +@click.option( + "--limit", type=int, metavar="INTEGER", default=None, help="Query result limits." +) +@click.option( + "--offset", + type=int, + metavar="INTEGER", + default=None, + help="Allows for a simple way to scroll through the results.", +) +@click.option( + "--matchtype", + type=click.Choice(["exact", "prefix", "host", "domain"], case_sensitive=False), + default=None, + help="Results matching a certain prefix, a certain host or all subdomains.", # noqa: E501 +) +def main( + username: str, + collapse: Optional[str], + timestamp_from: Optional[str], + timestamp_to: Optional[str], + limit: Optional[int], + offset: Optional[int], + matchtype: Optional[str], +) -> None: + """ + Retrieves archived tweets CDX data from the Wayback Machine, + performs necessary parsing, and saves the data. + + USERNAME: The Twitter username without @. + """ + try: + api = WaybackTweets( + username, collapse, timestamp_from, timestamp_to, limit, offset, matchtype + ) + + print("Making a request to the Internet Archive...") + archived_tweets = api.get() + + if archived_tweets: + field_options = [ + "archived_urlkey", + "archived_timestamp", + "original_tweet_url", + "archived_tweet_url", + "parsed_tweet_url", + "parsed_archived_tweet_url", + "available_tweet_text", + "available_tweet_is_RT", + "available_tweet_info", + "archived_mimetype", + "archived_statuscode", + "archived_digest", + "archived_length", + ] + + parser = TweetsParser(archived_tweets, username, field_options) + parsed_tweets = parser.parse(print_progress=True) + + exporter = TweetsExporter(parsed_tweets, username, field_options) + + exporter.save_to_csv() + exporter.save_to_json() + exporter.save_to_html() + except Exception as e: + rprint(f"[red]{e}") + finally: + rprint( + "[yellow]\nNeed help? Read the docs: https://claromes.github.io/waybacktweets" # noqa: E501 + ) diff --git a/waybacktweets/cli/__init__.py b/waybacktweets/cli/__init__.py deleted file mode 100644 index e38525e..0000000 --- a/waybacktweets/cli/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -# flake8: noqa: F401 - -from waybacktweets.cli.main import cli diff --git a/waybacktweets/cli/main.py b/waybacktweets/cli/main.py deleted file mode 100644 index 7c31e6a..0000000 --- a/waybacktweets/cli/main.py +++ /dev/null @@ -1,135 +0,0 @@ -""" -CLI functions for retrieving archived tweets. -""" - -from datetime import datetime -from typing import Any, Optional - -import click -from rich import print as rprint - -from waybacktweets.api.export_tweets import TweetsExporter -from waybacktweets.api.parse_tweets import TweetsParser -from waybacktweets.api.request_tweets import WaybackTweets - - -def parse_date( - ctx: Optional[Any] = None, param: Optional[Any] = None, value: Optional[str] = None -) -> Optional[str]: - """ - Parses a date string and returns it in the format "YYYYMMDD". - - :param ctx: Necessary when used with the click package. Defaults to None. - :param param: Necessary when used with the click package. Defaults to None. - :param value: A date string in the "YYYYMMDD" format. Defaults to None. - - :returns: The input date string formatted in the "YYYYMMDD" format, - or None if no date string was provided. - """ - try: - if value is None: - return None - - date = datetime.strptime(value, "%Y%m%d") - - return date.strftime("%Y%m%d") - except ValueError: - raise click.BadParameter("Date must be in format YYYYmmdd") - - -@click.command() -@click.argument("username", type=str) -@click.option( - "--collapse", - type=click.Choice(["urlkey", "digest", "timestamp:XX"], case_sensitive=False), - default=None, - help="Collapse results based on a field, or a substring of a field. XX in the timestamp value ranges from 1 to 14, comparing the first XX digits of the timestamp field. It is recommended to use from 4 onwards, to compare at least by years.", # noqa: E501 -) -@click.option( - "--from", - "timestamp_from", - type=click.UNPROCESSED, - metavar="DATE", - callback=parse_date, - default=None, - help="Filtering by date range from this date. Format: YYYYmmdd", -) -@click.option( - "--to", - "timestamp_to", - type=click.UNPROCESSED, - metavar="DATE", - callback=parse_date, - default=None, - help="Filtering by date range up to this date. Format: YYYYmmdd", -) -@click.option( - "--limit", type=int, metavar="INTEGER", default=None, help="Query result limits." -) -@click.option( - "--offset", - type=int, - metavar="INTEGER", - default=None, - help="Allows for a simple way to scroll through the results.", -) -@click.option( - "--matchtype", - type=click.Choice(["exact", "prefix", "host", "domain"], case_sensitive=False), - default=None, - help="Results matching a certain prefix, a certain host or all subdomains.", # noqa: E501 -) -def cli( - username: str, - collapse: Optional[str], - timestamp_from: Optional[str], - timestamp_to: Optional[str], - limit: Optional[int], - offset: Optional[int], - matchtype: Optional[str], -) -> None: - """ - Retrieves archived tweets CDX data from the Wayback Machine, - performs necessary parsing, and saves the data. - - USERNAME: The Twitter username without @. - """ - try: - api = WaybackTweets( - username, collapse, timestamp_from, timestamp_to, limit, offset, matchtype - ) - - print("Making a request to the Internet Archive...") - archived_tweets = api.get() - - if archived_tweets: - field_options = [ - "archived_urlkey", - "archived_timestamp", - "original_tweet_url", - "archived_tweet_url", - "parsed_tweet_url", - "parsed_archived_tweet_url", - "available_tweet_text", - "available_tweet_is_RT", - "available_tweet_info", - "archived_mimetype", - "archived_statuscode", - "archived_digest", - "archived_length", - ] - - parser = TweetsParser(archived_tweets, username, field_options) - parsed_tweets = parser.parse(print_progress=True) - - exporter = TweetsExporter(parsed_tweets, username, field_options) - - exporter.save_to_csv() - exporter.save_to_json() - exporter.save_to_html() - except Exception as e: - rprint(f"[red]{e}") - finally: - rprint( - "[yellow]\nNeed help? Read the docs: https://claromes.github.io/waybacktweets" # noqa: E501 - ) diff --git a/waybacktweets/utils/__init__.py b/waybacktweets/utils/__init__.py index 6752ce2..8a76855 100644 --- a/waybacktweets/utils/__init__.py +++ b/waybacktweets/utils/__init__.py @@ -1,3 +1,12 @@ # flake8: noqa: F401 -from waybacktweets.utils.utils import * +from waybacktweets.utils.utils import ( + check_double_status, + check_pattern_tweet, + clean_tweet_url, + clean_wayback_machine_url, + delete_tweet_pathnames, + get_response, + is_tweet_url, + semicolon_parser, +)