|
| 1 | +[ |
| 2 | + { |
| 3 | + "file": "/Users/pjain/git/coco-work/test_scos_migration/example/pyspark_transform.py", |
| 4 | + "lines": "46-46", |
| 5 | + "code": "spark.sparkContext.setLogLevel(\"WARN\")", |
| 6 | + "final_risk": 1.0, |
| 7 | + "root_cause": "Uses '.sparkContext' which is not supported in SCOS", |
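| 8 | + "explanation": "This line accesses spark.sparkContext only to set the log level; sparkContext access and RDD operations are not supported in SCOS.", |
| 9 | + "fix": "Remove the sparkContext call (it only sets the log level). Where RDD logic is involved, convert it to DataFrame operations. RDD operations are not supported in SCOS.", |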
| 10 | + "confidence": "HIGH" |
| 11 | + }, |
| 12 | + { |
| 13 | + "file": "/Users/pjain/git/coco-work/test_scos_migration/example/pyspark_transform.py", |
| 14 | + "lines": "50-50", |
| 15 | + "code": "companies = spark.read.parquet(os.path.join(DATA_DIR, \"companies.parquet\"))", |
| 16 | + "final_risk": 0.2, |
| 17 | + "root_cause": "We don't support partitioned writes to local files. The 4th argument, i.e. numPartitions, in the range function is a no-op in Snowpark Connect.", |
| 18 | + "explanation": "Reading parquet files is supported in SCOS. The preliminary assessment notes a potential performance concern when reading from external paths rather than Snowflake stages, but this is not a compatibility failure.", |
| 19 | + "fix": "For better performance, consider uploading files to a Snowflake stage first using session.file.put().", |
| 20 | + "confidence": "HIGH" |
| 21 | + }, |
| 22 | + { |
| 23 | + "file": "/Users/pjain/git/coco-work/test_scos_migration/example/pyspark_transform.py", |
| 24 | + "lines": "49-49", |
| 25 | + "code": "jobs = spark.read.parquet(os.path.join(DATA_DIR, \"jobs.parquet\"))", |
| 26 | + "final_risk": 0.2, |
| 27 | + "root_cause": "We don't support partitioned writes to local files. The 4th argument, i.e. numPartitions, in the range function is a no-op in Snowpark Connect.", |
| 28 | + "explanation": "Reading parquet files is supported in SCOS. The warning is about potential performance differences when reading from external paths, not a compatibility failure.", |
| 29 | + "fix": "For better performance, consider uploading files to a Snowflake stage first using session.file.put().", |
| 30 | + "confidence": "HIGH" |
| 31 | + }, |
| 32 | + { |
| 33 | + "file": "/Users/pjain/git/coco-work/test_scos_migration/example/pyspark_transform.py", |
| 34 | + "lines": "51-51", |
| 35 | + "code": "applications = spark.read.parquet(os.path.join(DATA_DIR, \"applications.parquet\"))", |
| 36 | + "final_risk": 0.2, |
| 37 | + "root_cause": "We don't support partitioned writes to local files. The 4th argument, i.e. numPartitions, in the range function is a no-op in Snowpark Connect.", |
| 38 | + "explanation": "Reading parquet files is supported in SCOS. The warning relates to potential performance differences when reading from external paths, not a compatibility failure.", |
| 39 | + "fix": "For better performance, consider uploading files to a Snowflake stage first using session.file.put().", |
| 40 | + "confidence": "HIGH" |
| 41 | + }, |
| 42 | + { |
| 43 | + "file": "/Users/pjain/git/coco-work/test_scos_migration/example/pyspark_transform.py", |
| 44 | + "lines": "100-104", |
| 45 | + "code": "final.select(\n \"job_id\", \"company_name\", \"industry\", \"title\", \"state\",\n \"salary_bucket\", \"salary_midpoint\", \"posted_month\",\n \"total_applications\", \"unique_applicants\", \"hires\",\n ).coalesce(1).write.mode(\"overwrite\").parquet(os.path.join(OUTPUT_DIR, \"job_analytics\"))", |
| 46 | + "final_risk": 0.2, |
| 47 | + "root_cause": "coalesce() is a no-op in SCOS - the code will run but may produce multiple output files instead of the intended single file", |
| 48 | + "explanation": "The coalesce(1) call is a no-op in SCOS, meaning the code will execute successfully but may not produce a single output file as intended. This is a behavioral difference rather than a failure.", |
| 49 | + "fix": "If single-file output is required, consider post-processing to merge files or use Snowflake-native methods for file consolidation. Otherwise, the code will work but with potentially multiple output files.", |
| 50 | + "confidence": "HIGH" |
| 51 | + }, |
| 52 | + { |
| 53 | + "file": "/Users/pjain/git/coco-work/test_scos_migration/example/pyspark_transform.py", |
| 54 | + "lines": "57-59", |
| 55 | + "code": "jobs_deduped = jobs.withColumn(\"_rn\", F.row_number().over(w)) \\\n .filter(F.col(\"_rn\") == 1) \\\n .drop(\"_rn\")", |
| 56 | + "final_risk": 0.15, |
| 57 | + "root_cause": "Cannot filter using original DataFrame columns after transformation operations (drop, select, withColumn, etc.)", |
| 58 | + "explanation": "The code uses standard window functions and filtering on a newly created column. Unlike the similar test cases which fail when referencing original DataFrame columns after transformations, this code filters on '_rn' which is created in the same transformation chain.", |
| 59 | + "fix": null, |
| 60 | + "confidence": "MEDIUM" |
| 61 | + }, |
| 62 | + { |
| 63 | + "file": "/Users/pjain/git/coco-work/test_scos_migration/example/pyspark_transform.py", |
| 64 | + "lines": "60-62", |
| 65 | + "code": "jobs_clean = jobs_deduped \\\n .filter(F.col(\"salary_min\").isNotNull()) \\\n .filter(F.col(\"salary_max\") > F.col(\"salary_min\"))", |
| 66 | + "final_risk": 0.1, |
| 67 | + "root_cause": "Cannot filter using original DataFrame columns after transformation operations (drop, select, withColumn, etc.)", |
| 68 | + "explanation": "The input code performs standard filtering on existing columns. The similar test cases fail due to filtering on columns after drop() operations, which doesn't apply here since we're filtering directly on the DataFrame's own columns.", |
| 69 | + "fix": null, |
| 70 | + "confidence": "HIGH" |
| 71 | + }, |
| 72 | + { |
| 73 | + "file": "/Users/pjain/git/coco-work/test_scos_migration/example/pyspark_transform.py", |
| 74 | + "lines": "76-80", |
| 75 | + "code": "app_stats = applications.groupBy(\"job_id\").agg(\n F.count(\"*\").alias(\"total_applications\"),\n F.countDistinct(\"applicant_id\").alias(\"unique_applicants\"),\n F.sum(F.when(F.col(\"status\") == \"hired\", 1).otherwise(0)).alias(\"hires\"),\n )", |
| 76 | + "final_risk": 0.1, |
| 77 | + "root_cause": "Ambiguous column reference in select after groupBy and agg on the same column", |
| 78 | + "explanation": "The input code uses proper aliasing for all aggregations, avoiding the ambiguous column reference issue from the similar test cases. The first/last non-determinism issue doesn't apply since those functions aren't used.", |
| 79 | + "fix": null, |
| 80 | + "confidence": "HIGH" |
| 81 | + } |
| 82 | +] |
0 commit comments