22   | 22   |
23   | 23   | # pytype: skip-file
24   | 24   |
25   |      | -import collections
26   |      | -import copy
27   | 25   | import logging
28   | 26   | import sys
29   | 27   | import threading

42   | 40   | from apache_beam.coders import coders
43   | 41   | from apache_beam.internal import util
44   | 42   | from apache_beam.options.value_provider import RuntimeValueProvider
45   |      | -from apache_beam.portability import common_urns
46   |      | -from apache_beam.portability.api import beam_runner_api_pb2
47   | 43   | from apache_beam.pvalue import TaggedOutput
48   | 44   | from apache_beam.runners.sdf_utils import NoOpWatermarkEstimatorProvider
49   | 45   | from apache_beam.runners.sdf_utils import RestrictionTrackerView

53   | 49   | from apache_beam.runners.sdf_utils import ThreadsafeWatermarkEstimator
54   | 50   | from apache_beam.transforms import DoFn
55   | 51   | from apache_beam.transforms import core
56   |      | -from apache_beam.transforms import environments
57   | 52   | from apache_beam.transforms import userstate
58   | 53   | from apache_beam.transforms.core import RestrictionProvider
59   | 54   | from apache_beam.transforms.core import WatermarkEstimatorProvider
60   | 55   | from apache_beam.transforms.window import GlobalWindow
61   | 56   | from apache_beam.transforms.window import GlobalWindows
62   | 57   | from apache_beam.transforms.window import TimestampedValue
63   | 58   | from apache_beam.transforms.window import WindowFn
64   |      | -from apache_beam.typehints import typehints
65   | 59   | from apache_beam.typehints.batch import BatchConverter
66   | 60   | from apache_beam.utils.counters import Counter
67   | 61   | from apache_beam.utils.counters import CounterName

@@ -1920,159 +1914,3 @@ def windows(self):
1920 | 1914 |       raise AttributeError('windows not accessible in this context')
1921 | 1915 |     else:
1922 | 1916 |       return self.windowed_value.windows
1923 |      | -
1924 |      | -
1925 |      | -def group_by_key_input_visitor(deterministic_key_coders=True):
1926 |      | -  # Importing here to avoid a circular dependency
1927 |      | -  # pylint: disable=wrong-import-order, wrong-import-position
1928 |      | -  from apache_beam.pipeline import PipelineVisitor
1929 |      | -  from apache_beam.transforms.core import GroupByKey
1930 |      | -
1931 |      | -  class GroupByKeyInputVisitor(PipelineVisitor):
1932 |      | -    """A visitor that replaces `Any` element type for input `PCollection` of
1933 |      | -    a `GroupByKey` with a `KV` type.
1934 |      | -
1935 |      | -    TODO(BEAM-115): Once Python SDK is compatible with the new Runner API,
1936 |      | -    we could directly replace the coder instead of mutating the element type.
1937 |      | -    """
1938 |      | -    def __init__(self, deterministic_key_coders=True):
1939 |      | -      self.deterministic_key_coders = deterministic_key_coders
1940 |      | -
1941 |      | -    def enter_composite_transform(self, transform_node):
1942 |      | -      self.visit_transform(transform_node)
1943 |      | -
1944 |      | -    def visit_transform(self, transform_node):
1945 |      | -      if isinstance(transform_node.transform, GroupByKey):
1946 |      | -        pcoll = transform_node.inputs[0]
1947 |      | -        pcoll.element_type = typehints.coerce_to_kv_type(
1948 |      | -            pcoll.element_type, transform_node.full_label)
1949 |      | -        pcoll.requires_deterministic_key_coder = (
1950 |      | -            self.deterministic_key_coders and transform_node.full_label)
1951 |      | -        key_type, value_type = pcoll.element_type.tuple_types
1952 |      | -        if transform_node.outputs:
1953 |      | -          key = next(iter(transform_node.outputs.keys()))
1954 |      | -          transform_node.outputs[key].element_type = typehints.KV[
1955 |      | -              key_type, typehints.Iterable[value_type]]
1956 |      | -          transform_node.outputs[key].requires_deterministic_key_coder = (
1957 |      | -              self.deterministic_key_coders and transform_node.full_label)
1958 |      | -
1959 |      | -  return GroupByKeyInputVisitor(deterministic_key_coders)
1960 |      | -
1961 |      | -
1962 |      | -def validate_pipeline_graph(pipeline_proto):
1963 |      | -  """Ensures this is a correctly constructed Beam pipeline.
1964 |      | -  """
1965 |      | -  def get_coder(pcoll_id):
1966 |      | -    return pipeline_proto.components.coders[
1967 |      | -        pipeline_proto.components.pcollections[pcoll_id].coder_id]
1968 |      | -
1969 |      | -  def validate_transform(transform_id):
1970 |      | -    transform_proto = pipeline_proto.components.transforms[transform_id]
1971 |      | -
1972 |      | -    # Currently the only validation we perform is that GBK operations have
1973 |      | -    # their coders set properly.
1974 |      | -    if transform_proto.spec.urn == common_urns.primitives.GROUP_BY_KEY.urn:
1975 |      | -      if len(transform_proto.inputs) != 1:
1976 |      | -        raise ValueError("Unexpected number of inputs: %s" % transform_proto)
1977 |      | -      if len(transform_proto.outputs) != 1:
1978 |      | -        raise ValueError("Unexpected number of outputs: %s" % transform_proto)
1979 |      | -      input_coder = get_coder(next(iter(transform_proto.inputs.values())))
1980 |      | -      output_coder = get_coder(next(iter(transform_proto.outputs.values())))
1981 |      | -      if input_coder.spec.urn != common_urns.coders.KV.urn:
1982 |      | -        raise ValueError(
1983 |      | -            "Bad coder for input of %s: %s" % (transform_id, input_coder))
1984 |      | -      if output_coder.spec.urn != common_urns.coders.KV.urn:
1985 |      | -        raise ValueError(
1986 |      | -            "Bad coder for output of %s: %s" % (transform_id, output_coder))
1987 |      | -      output_values_coder = pipeline_proto.components.coders[
1988 |      | -          output_coder.component_coder_ids[1]]
1989 |      | -      if (input_coder.component_coder_ids[0] !=
1990 |      | -          output_coder.component_coder_ids[0] or
1991 |      | -          output_values_coder.spec.urn != common_urns.coders.ITERABLE.urn or
1992 |      | -          output_values_coder.component_coder_ids[0] !=
1993 |      | -          input_coder.component_coder_ids[1]):
1994 |      | -        raise ValueError(
1995 |      | -            "Incompatible input coder %s and output coder %s for transform %s" %
1996 |      | -            (transform_id, input_coder, output_coder))
1997 |      | -    elif transform_proto.spec.urn == common_urns.primitives.ASSIGN_WINDOWS.urn:
1998 |      | -      if not transform_proto.inputs:
1999 |      | -        raise ValueError("Missing input for transform: %s" % transform_proto)
2000 |      | -    elif transform_proto.spec.urn == common_urns.primitives.PAR_DO.urn:
2001 |      | -      if not transform_proto.inputs:
2002 |      | -        raise ValueError("Missing input for transform: %s" % transform_proto)
2003 |      | -
2004 |      | -    for t in transform_proto.subtransforms:
2005 |      | -      validate_transform(t)
2006 |      | -
2007 |      | -  for t in pipeline_proto.root_transform_ids:
2008 |      | -    validate_transform(t)
2009 |      | -
2010 |      | -
2011 |      | -def merge_common_environments(pipeline_proto, inplace=False):
2012 |      | -  def dep_key(dep):
2013 |      | -    if dep.type_urn == common_urns.artifact_types.FILE.urn:
2014 |      | -      payload = beam_runner_api_pb2.ArtifactFilePayload.FromString(
2015 |      | -          dep.type_payload)
2016 |      | -      if payload.sha256:
2017 |      | -        type_info = 'sha256', payload.sha256
2018 |      | -      else:
2019 |      | -        type_info = 'path', payload.path
2020 |      | -    elif dep.type_urn == common_urns.artifact_types.URL.urn:
2021 |      | -      payload = beam_runner_api_pb2.ArtifactUrlPayload.FromString(
2022 |      | -          dep.type_payload)
2023 |      | -      if payload.sha256:
2024 |      | -        type_info = 'sha256', payload.sha256
2025 |      | -      else:
2026 |      | -        type_info = 'url', payload.url
2027 |      | -    else:
2028 |      | -      type_info = dep.type_urn, dep.type_payload
2029 |      | -    return type_info, dep.role_urn, dep.role_payload
2030 |      | -
2031 |      | -  def base_env_key(env):
2032 |      | -    return (
2033 |      | -        env.urn,
2034 |      | -        env.payload,
2035 |      | -        tuple(sorted(env.capabilities)),
2036 |      | -        tuple(sorted(env.resource_hints.items())),
2037 |      | -        tuple(sorted(dep_key(dep) for dep in env.dependencies)))
2038 |      | -
2039 |      | -  def env_key(env):
2040 |      | -    return tuple(
2041 |      | -        sorted(
2042 |      | -            base_env_key(e)
2043 |      | -            for e in environments.expand_anyof_environments(env)))
2044 |      | -
2045 |      | -  canonical_environments = collections.defaultdict(list)
2046 |      | -  for env_id, env in pipeline_proto.components.environments.items():
2047 |      | -    canonical_environments[env_key(env)].append(env_id)
2048 |      | -
2049 |      | -  if len(canonical_environments) == len(pipeline_proto.components.environments):
2050 |      | -    # All environments are already sufficiently distinct.
2051 |      | -    return pipeline_proto
2052 |      | -
2053 |      | -  environment_remappings = {
2054 |      | -      e: es[0]
2055 |      | -      for es in canonical_environments.values() for e in es
2056 |      | -  }
2057 |      | -
2058 |      | -  if not inplace:
2059 |      | -    pipeline_proto = copy.copy(pipeline_proto)
2060 |      | -
2061 |      | -  for t in pipeline_proto.components.transforms.values():
2062 |      | -    if t.environment_id not in pipeline_proto.components.environments:
2063 |      | -      # TODO(https://github.com/apache/beam/issues/30876): Remove this
2064 |      | -      # workaround.
2065 |      | -      continue
2066 |      | -    if t.environment_id:
2067 |      | -      t.environment_id = environment_remappings[t.environment_id]
2068 |      | -  for w in pipeline_proto.components.windowing_strategies.values():
2069 |      | -    if w.environment_id not in pipeline_proto.components.environments:
2070 |      | -      # TODO(https://github.com/apache/beam/issues/30876): Remove this
2071 |      | -      # workaround.
2072 |      | -      continue
2073 |      | -    if w.environment_id:
2074 |      | -      w.environment_id = environment_remappings[w.environment_id]
2075 |      | -  for e in set(pipeline_proto.components.environments.keys()) - set(
2076 |      | -      environment_remappings.values()):
2077 |      | -    del pipeline_proto.components.environments[e]
2078 |      | -  return pipeline_proto
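Of the helpers deleted above, `group_by_key_input_visitor` has the most self-contained effect: for every `GroupByKey` it coerces the input `PCollection`'s element type to a `KV` type and sets the output element type to `KV[key, Iterable[value]]`. The sketch below reproduces only that type rewrite, outside any pipeline, using the same `apache_beam.typehints.typehints` helpers the removed code imports. It is illustrative, not part of this diff: `'MyGBK'` stands in for a transform's full label, and it assumes (as the removed docstring states) that an `Any` element type is coerced to `KV[Any, Any]`.

```python
from apache_beam.typehints import typehints

# An un-annotated PCollection element type (Any) is coerced to a KV type,
# mirroring the removed visitor's call to typehints.coerce_to_kv_type.
input_type = typehints.coerce_to_kv_type(typehints.Any, 'MyGBK')

# The GroupByKey output element type is then derived the same way the
# visitor does: KV[key_type, Iterable[value_type]].
key_type, value_type = input_type.tuple_types
output_type = typehints.KV[key_type, typehints.Iterable[value_type]]

print(input_type, output_type)
```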
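The removed `merge_common_environments` deduplicates a pipeline proto's environments: each environment is reduced to a canonical key (urn, payload, sorted capabilities, resource hints, and dependency fingerprints), every id sharing a key is remapped to one representative, references in transforms and windowing strategies are rewritten, and the orphaned duplicates are deleted. A minimal sketch of that canonicalize-and-remap pattern over plain dicts follows; the ids, keys, and `transforms` mapping are made-up stand-ins for the proto fields, not Beam APIs.

```python
import collections

# Toy model: environment id -> canonical key. In the removed helper the key is
# built from urn, payload, capabilities, resource hints and dependency hashes.
environments = {
    'env1': ('beam:env:docker:v1', b'py39'),
    'env2': ('beam:env:docker:v1', b'py39'),  # same key as env1
    'env3': ('beam:env:docker:v1', b'java11'),
}
transforms = {'t1': 'env1', 't2': 'env2', 't3': 'env3'}  # transform -> env id

# Group ids by canonical key, then map every id to the first id in its group.
canonical = collections.defaultdict(list)
for env_id, key in environments.items():
  canonical[key].append(env_id)
remap = {env_id: ids[0] for ids in canonical.values() for env_id in ids}

# Rewrite references and drop environments that are no longer referenced.
transforms = {t: remap[e] for t, e in transforms.items()}
environments = {e: environments[e] for e in set(remap.values())}

print(transforms)    # {'t1': 'env1', 't2': 'env1', 't3': 'env3'}
print(environments)  # 'env2' has been merged into 'env1'
```

The real helper additionally copies the proto when `inplace=False` and skips dangling `environment_id` references, per the TODO in the removed code.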