crawler-to-md/main.py at 934efa4e45356bb66d888ce230ee0a5aa83f5fad · obeone/crawler-to-md · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
import src.log_setup as log_setup
import os
import argparse
import src.utils as utils
import sys

# Setup logging based on environment variable or default to WARN level before importing other modules.
log_level = os.getenv("LOG_LEVEL", "WARN")
log_setup.setup_logging(log_level)

from src.database_manager import DatabaseManager
from src.export_manager import ExportManager
from src.scraper import Scraper

logger = log_setup.get_logger()
logger.name = "main"


def main():
    """
    Main function to start the web scraper application.

    This function parses command line arguments, initializes necessary components,
    and manages the scraping and exporting process.

    Raises:
        ValueError: If neither a URL nor a URLs file is provided.
    """
    logger.info("Starting the web scraper application.")

    # Parse command line arguments
    parser = argparse.ArgumentParser(description="Web Scraper to Markdown")
    parser.add_argument("--url", "-u", help="Base URL to start scraping")
    parser.add_argument(
        "--urls-file",
        help="Path to a file containing URLs to scrape, one URL per line. If '-', read from stdin.",
    )
    parser.add_argument(
        "--output-folder", "-o", help="Output folder for the markdown file", default="./output"
    )
    parser.add_argument(
        "--cache-folder", "-c", help="Cache folder for storing database", default="./cache"
    )
    parser.add_argument(
        "--base-url", "-b", help="Base URL for filtering links. Defaults to the URL base"
    )
    parser.add_argument(
        "--title", "-t", help="Final title of the markdown file. Defaults to the URL"
    )
    parser.add_argument(
        "--exclude",
        "-e",
        action="append",
        help="Exclude URLs containing this string",
        default=[],
    )
    parser.add_argument(
        "--export-individual",
        "-ei",
        action="store_true",
        help="Export each page as an individual Markdown file",
        default=False,
    )
    parser.add_argument(
        "--rate-limit",
        "-rl",
        type=int,
        help="Maximum number of requests per minute",
        default=0,
    )
    parser.add_argument(
        "--delay",
        "-d",
        type=float,
        help="Delay between requests in seconds",
        default=0,
    )

    try:
        import argcomplete

        argcomplete.autocomplete(parser)
    except ImportError:
        pass

    args = parser.parse_args()
    logger.debug(f"Command line arguments parsed: {args}")

    # Read URLs from a file or stdin
    if args.urls_file:
        if args.urls_file == "-":
            print("Enter URLs, one per line (Ctrl-D to finish):")
            urls_list = [line.strip() for line in sys.stdin]
        else:
            with open(args.urls_file, "r") as file:
                urls_list = [line.strip() for line in file.readlines()]

        urls_list = utils.deduplicate_list(urls_list)
        args.url = None  # Ensure args.url is defined even if not used
    else:
        urls_list = []

    if not args.url and not urls_list:
        raise ValueError("No URL provided. Please provide either --url or --urls-file.")

    output = os.path.join(
        args.output_folder,
        utils.url_to_filename(args.url) if args.url else utils.url_to_filename(urls_list[0]),
    )

    # Create the output folder if it does not exist
    if not os.path.exists(output):
        logger.info(f"Creating output folder at {output}")
        os.makedirs(output)

    # Create the cache folder if it does not exist
    if not os.path.exists(args.cache_folder):
        logger.info(f"Creating cache folder at {args.cache_folder}")
        os.makedirs(args.cache_folder)

    # If no base url, set it to the url base
    if not args.base_url:
        if not args.urls_file:
            args.base_url = utils.url_dirname(args.url if args.url else urls_list[0])
        logger.debug(f"No base URL provided. Setting base URL to {args.base_url}")

    # If no title, set it to the url base
    if not args.title:
        args.title = args.url if args.url else urls_list[0]
        logger.debug(f"No title provided. Setting title to {args.title}")

    # Initialize managers
    db_manager = DatabaseManager(
        os.path.join(
            args.cache_folder,
            utils.url_to_filename(args.url if args.url else urls_list[0]) + ".sqlite",
        )
    )
    logger.info("DatabaseManager initialized.")

    scraper = Scraper(
        base_url=args.base_url,
        exclude_patterns=args.exclude,
        db_manager=db_manager,
        rate_limit=args.rate_limit,
        delay=args.delay,
    )
    logger.info("Scraper initialized.")

    # Start the scraping process
    logger.info(f"Starting the scraping process for URL: {args.url}")
    scraper.start_scraping(url=args.url, urls_list=urls_list)

    output_name = utils.randomstring_to_filename(args.title)

    # After the scraping process is completed in the main function
    export_manager = ExportManager(db_manager, args.title)
    logger.info("ExportManager initialized.")

    export_manager.export_to_markdown(os.path.join(output, f"{output_name}.md"))
    logger.info("Export to markdown completed.")

    export_manager.export_to_json(os.path.join(output, f"{output_name}.json"))
    logger.info("Export to JSON completed.")

    if args.export_individual:
        logger.info("Export of individual pages...")
        output_folder_ei = export_manager.export_individual_markdown(
            output_folder=output, base_url=args.base_url if args.base_url else None
        )
        logger.info("Export of individual Markdown files completed.")

    markdown_path = os.path.join(output, f"{output_name}.md")
    json_path = os.path.join(output, f"{output_name}.json")
    print(f"\033[94m Markdown file generated at: \033[0m", markdown_path)
    print(f"\033[92m JSON file generated at: \033[0m", json_path)
    if args.export_individual:
        print(f"\033[95m Individual Markdown files exported to: \033[0m", output_folder_ei)


if __name__ == "__main__":
    main()