|
| 1 | +""" |
| 2 | +Seed script: connects to Teradata, extracts one-line summaries from teradataml |
| 3 | +__init__.__doc__ for all TD_ANALYTIC_FUNCS, then prints the new dict[str, str] |
| 4 | +block ready to paste into constants.py. |
| 5 | +
|
| 6 | +Requires DATABASE_URI env var: |
| 7 | + export DATABASE_URI="teradata://user:pass@host:1025/db" |
| 8 | + uv run python scripts/seed_tdml_summaries.py |
| 9 | +""" |
| 10 | + |
| 11 | +import os |
| 12 | +import re |
| 13 | +import textwrap |
| 14 | +import warnings |
| 15 | + |
| 16 | +warnings.filterwarnings("ignore") |
| 17 | + |
| 18 | +import teradataml as tdml # noqa: E402 |
| 19 | + |
# Connect so that teradataml populates __init__.__doc__ on each class.
# DATABASE_URI must look like: teradata://user:pass@host:port/db
_uri = os.environ.get("DATABASE_URI", "")
if not _uri:
    raise EnvironmentError("DATABASE_URI not set — docstrings require a live connection")

_m = re.match(
    r"teradata://(?P<user>[^:]+):(?P<password>[^@]+)@(?P<host>[^:]+):(?P<port>\d+)/(?P<db>.+)",
    _uri,
)
if _m is None:
    # Never echo the URI itself: it embeds the password, and this message
    # ends up in tracebacks, CI logs and shell history.
    raise ValueError(
        "Cannot parse DATABASE_URI; expected teradata://user:pass@host:port/db "
        "(value withheld because it contains credentials)"
    )

# NOTE: the port is required by the pattern but is not forwarded to
# create_context(); only host, username, password and database are passed.
tdml.create_context(
    host=_m.group("host"),
    username=_m.group("user"),
    password=_m.group("password"),
    database=_m.group("db"),
)
| 35 | + |
# Names of the teradataml analytic functions to summarise. The order below is
# the order in which entries are printed, so it is kept exactly as authored
# (it is only approximately alphabetical). One row group per initial letter.
FUNCS = [
    "ANOVA", "Attribution", "Antiselect", "Apriori",
    "BincodeFit", "BincodeTransform",
    "CFilter", "CategoricalSummary", "ChiSq", "ClassificationEvaluator",
    "ColumnSummary", "ColumnTransformer", "ConvertTo",
    "DecisionForest",
    "FTest", "FillRowId", "Fit",
    "GetFutileColumns", "GetRowsWithMissingValues", "GetRowsWithoutMissingValues",
    "GLM", "GLMPerSegment",
    "Histogram",
    "KMeans", "KMeansPredict", "KNN",
    "MovingAverage",
    "NERExtractor", "NGramSplitter",
    "NaiveBayesTextClassifierPredict", "NaiveBayesTextClassifierTrainer",
    "NonLinearCombineFit", "NonLinearCombineTransform", "NumApply", "NPath",
    "OneClassSVM", "OneClassSVMPredict",
    "OneHotEncodingFit", "OneHotEncodingTransform",
    "OrdinalEncodingFit", "OrdinalEncodingTransform",
    "OutlierFilterFit", "OutlierFilterTransform",
    "Pack", "PolynomialFeaturesFit", "PolynomialFeaturesTransform", "Pivoting",
    "QQNorm",
    "ROC",
    "RandomProjectionFit", "RandomProjectionMinComponents", "RandomProjectionTransform",
    "RegressionEvaluator", "RoundColumns", "RowNormalizeFit", "RowNormalizeTransform",
    "SMOTE", "SVM", "SVMPredict", "ScaleFit", "ScaleTransform",
    "Sessionize", "SentimentExtractor", "Shap", "Silhouette",
    "SimpleImputeFit", "SimpleImputeTransform", "StrApply", "StringSimilarity",
    "TDDecisionForestPredict", "TDGLMPredict", "TDNaiveBayesPredict", "TFIDF",
    "TargetEncodingFit", "TargetEncodingTransform",
    "TextMorph", "TextParser", "TrainTestSplit", "Transform",
    "UnivariateStatistics", "Unpack", "Unpivoting",
    "VectorDistance",
    "WhichMax", "WhichMin", "WordEmbeddings",
    "XGBoost", "XGBoostPredict",
    "ZTest",
]
| 127 | + |
| 128 | + |
def extract_summary(func_name: str) -> str:
    """Return a one-sentence summary for a teradataml analytic function.

    Looks up ``func_name`` on the ``teradataml`` module, reads the docstring
    of its ``__init__`` and reduces it to the first sentence of the
    ``DESCRIPTION:`` section (or of the first paragraph when no such section
    is found).

    Args:
        func_name: Attribute name of the function on the ``teradataml`` module.

    Returns:
        A single-line summary ending in a period. When the attribute does not
        exist, or its docstring yields no text, the generic placeholder
        ``"Teradata ML analytic function <func_name>."`` is returned.
    """
    fallback = f"Teradata ML analytic function {func_name}."

    func_obj = getattr(tdml, func_name, None)
    if func_obj is None:
        return fallback

    raw = getattr(func_obj.__init__, "__doc__", None) or ""
    # Dedent and strip leading/trailing blank lines
    raw = textwrap.dedent(raw).strip()

    # The teradataml pattern is:
    #   DESCRIPTION:
    #       <summary text, may span multiple lines>
    #
    #   PARAMETERS:
    # Grab the DESCRIPTION section first. It ends at a blank line, at the
    # PARAMETERS header, or at the end of the docstring when DESCRIPTION is
    # the last section (otherwise the header itself would leak into the
    # fallback paragraph below).
    desc_match = re.search(
        r"DESCRIPTION\s*:\s*\n(.*?)(?:\n\s*\n|\n\s*PARAMETERS\s*:|\Z)",
        raw,
        re.DOTALL,
    )
    if desc_match:
        block = desc_match.group(1)
    else:
        # Fallback: take the first paragraph
        block = raw.split("\n\n")[0]

    # Collapse internal whitespace / newlines into a single line
    block = re.sub(r"\s+", " ", block).strip()
    if not block:
        # Empty docstring: without this guard the result would be a bare ".".
        return fallback

    # Replace teradataml-specific terminology (longer phrase first so that
    # "teradataml DataFrame" does not become "teradataml table name")
    block = block.replace("teradataml DataFrame", "table name")
    block = block.replace("DataFrame", "table name")

    # Truncate at the first sentence boundary: a period followed by
    # whitespace or by the end of the text. Keep the trailing period.
    sent_match = re.search(r"^(.*?\.)(?:\s|$)", block)
    if sent_match:
        return sent_match.group(1)

    # No sentence boundary — use the whole block but cap its length
    summary = block[:200].rstrip()
    if not summary.endswith("."):
        summary += "."
    return summary
| 171 | + |
| 172 | + |
def main() -> None:
    """Print a ``TD_ANALYTIC_FUNCS`` dict literal ready to paste into constants.py.

    Calls ``extract_summary`` for every name in ``FUNCS`` and prints one
    ``"name": "summary",`` entry per function. Afterwards, the names whose
    summary is only the generic placeholder (or contains no text at all) are
    listed as ``#`` comment lines so they can be reviewed by hand.
    """
    results: list[tuple[str, str]] = [(name, extract_summary(name)) for name in FUNCS]

    # Compare against the exact placeholder rather than searching for
    # substrings, so a genuine summary that happens to mention
    # "analytic function" and the function name is not reported as missing.
    # Summaries made only of dots/spaces are flagged as well.
    missing: list[str] = [
        name
        for name, summary in results
        if summary == f"Teradata ML analytic function {name}." or not summary.strip(" .")
    ]

    # Print the dict literal ready to paste into constants.py
    print("TD_ANALYTIC_FUNCS = {")
    for name, summary in results:
        # Escape backslashes first, then double quotes, so the emitted text
        # is a valid double-quoted Python string literal.
        safe = summary.replace("\\", "\\\\").replace('"', '\\"')
        print(f'    "{name}": "{safe}",')
    print("}")

    if missing:
        print(f"\n# WARNING: {len(missing)} functions had no extractable docstring — fallback used:")
        for m in missing:
            print(f"# {m}")


if __name__ == "__main__":
    main()
0 commit comments