module-toolkit/generate-module.py at main · genepattern/module-toolkit · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
#!/usr/bin/env python
"""
GenePattern Module Generator

A multi-agent system for automatically generating GenePattern modules from bioinformatics tools.
Uses Pydantic AI to orchestrate research, planning, and artifact generation.
"""

import os
import sys
import traceback
import argparse
from pathlib import Path
from typing import Dict, List
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

from agents.config import DEFAULT_OUTPUT_DIR, MAX_ARTIFACT_LOOPS, MAX_ESCALATIONS, configure_telemetry
from agents.example_data import ExampleDataItem, ExampleDataResolver
from agents.logger import Logger
from agents.module import ModuleAgent
from agents.status import ModuleGenerationStatus

# Enable telemetry with Logfire
configure_telemetry()


class GenerationScript:
    """
    Main script orchestration class for GenePattern module generation.
    Handles user input, argument parsing, and overall script coordination.
    """

    def __init__(self):
        """Create the logger and the empty state slots used by the script."""
        self.logger = Logger()
        # Filled in later by main() and the methods it calls: the parsed CLI
        # namespace, the collected tool metadata, the orchestrating agent and
        # the list of artifacts to skip.
        self.args = self.tool_info = self.module_agent = self.skip_artifacts = None

    def get_user_input(self) -> Dict[str, str]:
        """Prompt user for bioinformatics tool information"""
        self.logger.print_section("GenePattern Module Generator")
        print("This script will help you create a GenePattern module for a bioinformatics tool.")
        print("Please provide the following information:\n")

        tool_info = {}

        # Required fields
        tool_info['name'] = input("Tool name (e.g., 'samtools', 'bwa', 'star'): ").strip()
        if not tool_info['name']:
            print("Error: Tool name is required.")
            sys.exit(1)

        # Optional fields with defaults
        tool_info['version'] = input("Tool version (optional): ").strip() or "latest"
        tool_info['language'] = input("Primary language (python/r/java/c/cpp/other, optional): ").strip() or "unknown"
        tool_info['description'] = input("Brief description (optional): ").strip()
        tool_info['repository_url'] = input("Repository URL (optional): ").strip()
        tool_info['documentation_url'] = input("Documentation URL (optional): ").strip()
        tool_info['instructions'] = input("Additional instructions/context (optional): ").strip()
        tool_info['base_image'] = input("Known Docker base image (optional, e.g. 'broadinstitute/gatk:4.5.0.0'): ").strip()

        # Example data (optional)
        data_input = input("Example data files or URLs (space-separated, optional).\n"
                           "  Tip: append ::hint to clarify each file's role, e.g.:\n"
                           "    sample1.bam::tumor_sample sample2.bam::normal_sample hg38.fasta::reference\n"
                           "> ").strip()
        if data_input:
            raw_items = data_input.split()
            resolver = ExampleDataResolver(self.logger)
            tool_info['example_data'] = resolver.resolve(raw_items)
        else:
            tool_info['example_data'] = []

        return tool_info

    def parse_arguments(self, argv=None):
        """Parse command line arguments and store the result in self.args.

        Args:
            argv: Optional list of argument strings to parse instead of the
                process command line. When None (the default) argparse reads
                sys.argv[1:], so existing callers are unaffected.
        """
        # Local import: only needed to clean up the help epilog below.
        import textwrap

        parser = argparse.ArgumentParser(
            description="Generate complete GenePattern modules from bioinformatics tool information",
            formatter_class=argparse.RawDescriptionHelpFormatter,
            # RawDescriptionHelpFormatter prints the epilog verbatim, so strip
            # the source-code indentation before handing it over.
            epilog=textwrap.dedent("""
                Examples:
                  # Generate all artifacts (default)
                  python generate-module.py

                  # Skip specific artifacts
                  python generate-module.py --skip-dockerfile --skip-gpunit

                  # Generate only wrapper and manifest
                  python generate-module.py --artifacts wrapper manifest

                  # Skip container-related artifacts for local development
                  python generate-module.py --skip-dockerfile

                Available artifacts: wrapper, manifest, paramgroups, gpunit, documentation, dockerfile
            """)
        )

        # Tool information
        parser.add_argument('--name', type=str, help='Tool name (e.g., "samtools")')
        parser.add_argument('--version', type=str, help='Tool version')
        parser.add_argument('--language', type=str, help='Primary language (e.g., "python")')
        parser.add_argument('--description', type=str, help='Brief description of the tool')
        parser.add_argument('--repository-url', type=str, help='URL of the source code repository')
        parser.add_argument('--documentation-url', type=str, help='URL of the tool documentation')
        parser.add_argument('--instructions', type=str, help='Additional instructions and context for module generation (e.g., which features to expose, which function to call)')
        parser.add_argument('--base-image', type=str, metavar='IMAGE',
                            help='Known Docker base image to use (e.g. "broadinstitute/gatk:4.5.0.0"). '
                                 'When provided this value is written directly into the plan\'s docker_image_tag '
                                 'field and passed to the Dockerfile agent, skipping the automatic image selection.')

        # Artifact skip flags
        parser.add_argument('--skip-wrapper', action='store_true', help='Skip generating wrapper script')
        parser.add_argument('--skip-manifest', action='store_true', help='Skip generating manifest file')
        parser.add_argument('--skip-paramgroups', action='store_true', help='Skip generating paramgroups.json file')
        parser.add_argument('--skip-gpunit', action='store_true', help='Skip generating GPUnit test file')
        parser.add_argument('--skip-documentation', action='store_true', help='Skip generating README.md documentation')
        parser.add_argument('--skip-dockerfile', action='store_true', help='Skip generating Dockerfile')

        # Alternative: specify only artifacts to generate
        parser.add_argument('--artifacts', nargs='+', choices=['wrapper', 'manifest', 'paramgroups', 'gpunit', 'documentation', 'dockerfile', 'none'], help="Generate only specified artifacts, or 'none' to skip all (alternative to --skip-* flags)")

        # Resume from previous run
        parser.add_argument('--resume', type=str, metavar='MODULE_DIR', help='Resume generation from a previous run using the specified module directory')

        # Max loops configuration
        parser.add_argument('--max-loops', type=int, metavar='X', default=MAX_ARTIFACT_LOOPS, help=f'Maximum number of generation attempts per artifact (default: {MAX_ARTIFACT_LOOPS})')

        # Max escalations configuration
        parser.add_argument('--max-escalations', type=int, metavar='N', default=MAX_ESCALATIONS, help=f'Maximum cross-artifact escalation attempts per artifact pair (default: {MAX_ESCALATIONS})')

        # Output directory
        parser.add_argument('--output-dir', default=DEFAULT_OUTPUT_DIR, type=str, help=f'Output directory for generated modules (default: {DEFAULT_OUTPUT_DIR})')

        # Pre-created module directory (used by the web UI to guarantee name consistency)
        parser.add_argument('--module-dir', type=str, metavar='PATH',
                            help='Use this pre-created directory as the module output directory instead of '
                                 'generating a new timestamped name under --output-dir.')

        # Zip options
        parser.add_argument('--no-zip', action='store_true', help='Skip creating a zip archive of artifact files')
        parser.add_argument('--zip-only', action='store_true', help='After creating zip archive, delete the individual artifact files (keeps only the zip)')

        # Docker push
        parser.add_argument('--docker-push', action='store_true', help='Push the Docker image to Docker Hub after building')

        # GenePattern upload. The built-in server default is named once so the
        # help text cannot drift from the value that is actually used.
        default_gp_server = 'https://beta.genepattern.org/gp'
        parser.add_argument('--gp-server', type=str, metavar='URL',
                            default=os.getenv('GP_SERVER', default_gp_server),
                            help=f'GenePattern server URL to upload the module zip to (default: {default_gp_server}, or GP_SERVER env var)')
        parser.add_argument('--gp-user', type=str, metavar='USERNAME',
                            default=os.getenv('GP_USER', ''),
                            help='GenePattern username (or set GP_USER env var)')
        parser.add_argument('--gp-password', type=str, metavar='PASSWORD',
                            default=os.getenv('GP_PASSWORD', ''),
                            help='GenePattern password (or set GP_PASSWORD env var)')

        # Example data
        parser.add_argument('--data', nargs='+', metavar='PATH_OR_URL[::HINT]',
                            help='Example data files (local paths or HTTP/HTTPS URLs). '
                                 'Each entry may include an optional semantic hint after "::" '
                                 'to clarify the role of the file when multiple files share the '
                                 'same extension (e.g. sample1.bam::tumor_sample '
                                 'sample2.bam::normal_sample hg38.fasta::reference '
                                 'foo.vcf::germline_resource bar.vcf::panel_of_normals). '
                                 'Hints are shown to the LLM during planning and artifact '
                                 'generation, and are used by the runtime test to assign the '
                                 'correct file to each parameter when multiple files have the '
                                 'same extension. URLs are downloaded before planning so their '
                                 'contents can inform the LLM. Local files are used directly. '
                                 'All files are bind-mounted during the Dockerfile runtime test.')

        # argv=None makes argparse fall back to sys.argv[1:].
        self.args = parser.parse_args(argv)

    def tool_info_from_args(self):
        """Extract tool information from command line arguments"""
        self.tool_info = {
            'name': self.args.name,
            'version': self.args.version or "latest",
            'language': self.args.language or "unknown",
            'description': self.args.description or "",
            'repository_url': self.args.repository_url or "",
            'documentation_url': self.args.documentation_url or "",
            'instructions': self.args.instructions or "",
            'base_image': self.args.base_image or "",
            'example_data': [],
            'module_dir': self.args.module_dir or "",
        }
        # Resolve --data items if provided
        if self.args.data:
            resolver = ExampleDataResolver(self.logger)
            self.tool_info['example_data'] = resolver.resolve(self.args.data)

    def parse_skip_artifacts(self):
        """Determine which artifacts to skip based on command line arguments"""
        self.skip_artifacts = []
        all_artifacts = ['wrapper', 'manifest', 'paramgroups', 'gpunit', 'documentation', 'dockerfile']

        # If --artifacts specified, skip everything not in the list
        if self.args.artifacts:
            if 'none' in self.args.artifacts:
                self.skip_artifacts = all_artifacts
                self.logger.print_status("Skipping all artifact generation as '--artifacts none' was specified.")
            else:
                self.skip_artifacts = [artifact for artifact in all_artifacts if artifact not in self.args.artifacts]
                self.logger.print_status(f"Generating only: {', '.join(self.args.artifacts)}")
        else:
            # Use individual skip flags
            if self.args.skip_wrapper:       self.skip_artifacts.append('wrapper')
            if self.args.skip_manifest:      self.skip_artifacts.append('manifest')
            if self.args.skip_paramgroups:   self.skip_artifacts.append('paramgroups')
            if self.args.skip_gpunit:        self.skip_artifacts.append('gpunit')
            if self.args.skip_documentation: self.skip_artifacts.append('documentation')
            if self.args.skip_dockerfile:    self.skip_artifacts.append('dockerfile')

            if self.skip_artifacts:          self.logger.print_status(f"Skipping: {', '.join(self.skip_artifacts)}")

    def main(self):
        """Run module generation end to end and return a process exit status.

        Parses the command line, then either resumes a previous run
        (--resume) or starts a new one using --name and friends, falling back
        to interactive prompts when --name is absent.

        Returns:
            Whatever ModuleAgent.run() returns when generation completes, or
            1 when the run is interrupted with Ctrl-C or fails with an
            unexpected exception.
        """
        try:
            # Parse command line arguments
            self.parse_arguments()
            self.parse_skip_artifacts()

            # Initialize ModuleAgent with logger and module directory
            self.module_agent = ModuleAgent(self.logger, self.args.output_dir)

            # Options forwarded unchanged to ModuleAgent.run() on both the
            # resume path and the fresh-run path.
            shared_options = dict(
                max_loops=self.args.max_loops,
                no_zip=self.args.no_zip,
                zip_only=self.args.zip_only,
                docker_push=self.args.docker_push,
                max_escalations=self.args.max_escalations,
                gp_server=self.args.gp_server,
                gp_user=self.args.gp_user,
                gp_password=self.args.gp_password,
            )

            # Check if resuming from a previous run
            if self.args.resume:
                self.logger.print_status(f"Resuming from previous run in directory: {self.args.resume}")
                status = self.module_agent.load_status(self.args.resume)

                # Resolve fresh --data override if provided on resume
                resume_example_data = None
                if self.args.data:
                    resolver = ExampleDataResolver(self.logger)
                    resume_example_data = resolver.resolve(self.args.data)
                    self.logger.print_status(f"--data override: {len(resume_example_data)} item(s) will replace persisted example_data")

                return self.module_agent.run(
                    skip_artifacts=self.skip_artifacts,
                    resume_status=status,
                    example_data=resume_example_data,
                    **shared_options,
                )

            # Get tool information from args or user input
            if self.args.name:
                self.tool_info_from_args()
            else:
                self.tool_info = self.get_user_input()
                # The interactive prompts know nothing about --module-dir, so
                # carry it over here; otherwise the option would be silently
                # dropped whenever --name is omitted.
                self.tool_info.setdefault('module_dir', self.args.module_dir or "")

            # Example data travels as its own argument, not inside tool_info.
            example_data = self.tool_info.pop('example_data', []) or []

            # Run the generation process
            return self.module_agent.run(
                self.tool_info,
                self.skip_artifacts,
                example_data=example_data,
                **shared_options,
            )

        except KeyboardInterrupt:
            self.logger.print_status("\nGeneration interrupted by user", "WARNING")
            return 1
        except Exception as e:
            # Top-level boundary: report the failure and turn it into a
            # non-zero exit status instead of a raw traceback.
            self.logger.print_status(f"Unexpected error: {str(e)}", "ERROR")
            self.logger.print_status(f"Traceback: {traceback.format_exc()}", "DEBUG")
            return 1

if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    sys.exit(GenerationScript().main())