Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
140 changes: 76 additions & 64 deletions src/evidently/core/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,14 +55,12 @@ class ColumnRole(Enum):
Example = "example"


@dataclasses.dataclass
class ColumnInfo:
class ColumnInfo(BaseModel):
type: ColumnType
role: ColumnRole = ColumnRole.Unset


@dataclasses.dataclass
class BinaryClassification:
class BinaryClassification(BaseModel):
name: str
target: str
prediction_labels: Optional[str]
Expand All @@ -77,36 +75,34 @@ def __init__(
target: Optional[str] = None,
prediction_labels: Optional[str] = None,
prediction_probas: Optional[str] = None,
pos_label: Optional[str] = None,
pos_label: Optional[Label] = None,
labels: Optional[Dict[Label, str]] = None,
):
self.name = name
if (
target is None
and prediction_labels is None
and prediction_probas is None
and pos_label is None
and labels is None
):
self.target = "target"
self.prediction_labels = None
self.prediction_probas = "prediction"
self.pos_label = 1
self.labels = None
return
target = "target"
prediction_probas = "prediction"
pos_label = 1
if target is None or (prediction_labels is None and prediction_probas is None):
raise ValueError(
"Invalid BinaryClassification configuration:" " target and one of (labels or probas) should be set"
"Invalid BinaryClassification configuration:" " target and one of (labels or probas) should be set",
)
self.target = target
self.prediction_labels = prediction_labels
self.prediction_probas = prediction_probas
self.pos_label = pos_label if pos_label is not None else 1
self.labels = labels
super().__init__(
name=name,
target=target,
prediction_labels=prediction_labels,
prediction_probas=prediction_probas,
pos_label=pos_label if pos_label is not None else 1,
labels=labels,
)


@dataclasses.dataclass
class MulticlassClassification:
class MulticlassClassification(BaseModel):
name: str = "default"
target: str = "target"
prediction_labels: Optional[str] = "prediction"
Expand All @@ -122,35 +118,37 @@ def __init__(
prediction_probas: Optional[List[str]] = None,
labels: Optional[Dict[Label, str]] = None,
):
self.name = name
if target is None and prediction_labels is None and prediction_probas is None and labels is None:
self.target = "target"
self.prediction_labels = "prediction"
self.prediction_probas = None
self.labels = None
return
if target is None or (prediction_labels is None and prediction_probas is None):
raise ValueError(
"Invalid MulticlassClassification configuration:" " target and one of (labels or probas) should be set"
"Invalid MulticlassClassification configuration:" " target and one of (labels or probas) should be set",
)
if target is None and prediction_labels is None and prediction_probas is None and labels is None:
super().__init__(
name=name,
target="target",
prediction_labels="prediction",
prediction_probas=None,
labels=None,
)
else:
super().__init__(
target=target,
prediction_labels=prediction_labels,
prediction_probas=prediction_probas,
labels=labels,
)
self.target = target
self.prediction_labels = prediction_labels
self.prediction_probas = prediction_probas
self.labels = labels


Classification = Union[BinaryClassification, MulticlassClassification]
Classification = Union[MulticlassClassification, BinaryClassification]


@dataclasses.dataclass
class Regression:
class Regression(BaseModel):
name: str = "default"
target: str = "target"
prediction: str = "prediction"


@dataclasses.dataclass
class Recsys:
class Recsys(BaseModel):
name: str = "default"
user_id: str = "user_id"
item_id: str = "item_id"
Expand All @@ -159,18 +157,15 @@ class Recsys:
recommendations_type: str = "score"


@dataclasses.dataclass
class Completion:
class Completion(BaseModel):
pass


@dataclasses.dataclass
class RAG:
class RAG(BaseModel):
pass


@dataclasses.dataclass
class LLMClassification:
class LLMClassification(BaseModel):
input: str
target: str
predictions: Optional[str] = None
Expand All @@ -194,7 +189,6 @@ def get_column_type(self, column_name: str) -> Optional[ColumnType]:

LLMDefinition = Union[Completion, RAG, LLMClassification]


DEFAULT_TRACE_LINK_COLUMN = "_evidently_trace_link"


Expand Down Expand Up @@ -252,20 +246,16 @@ def __init__(
datetime_columns=datetime_columns,
unknown_columns=unknown_columns,
list_columns=list_columns,
# classification=classification,
# regression=regression,
# llm=llm,
classification=classification,
regression=regression,
llm=llm,
numerical_descriptors=numerical_descriptors if numerical_descriptors is not None else [],
categorical_descriptors=categorical_descriptors if categorical_descriptors is not None else [],
test_descriptors=test_descriptors,
# ranking=ranking,
ranking=ranking,
service_columns=service_columns,
special_columns=special_columns if special_columns is not None else [],
)
self.classification = classification
self.regression = regression
self.llm = llm
self.ranking = ranking

def get_numerical_columns(self):
return (self.numerical_columns or []) + (self.numerical_descriptors or [])
Expand Down Expand Up @@ -392,10 +382,10 @@ def to_descriptor(self, descriptor: Optional["Descriptor"] = None) -> "Descripto
raise ValueError("Parent descriptor is required for test without column")
descriptor_columns = descriptor.list_output_columns()
if len(descriptor_columns) == 1:
column = descriptor_columns[0]
column = descriptor_columns[0][0]
else:
raise ValueError(
f"Column is required for test with multiple columns in parent descriptor: [{', '.join(descriptor_columns)}]"
f"Column is required for test with multiple columns in parent descriptor: [{', '.join(descriptor_columns)}]",
)
else:
column = self.column
Expand All @@ -421,7 +411,9 @@ def __init__(self, alias: str, tests: Optional[List[AnyDescriptorTest]] = None,

@abc.abstractmethod
def generate_data(
self, dataset: "Dataset", options: Options
self,
dataset: "Dataset",
options: Options,
) -> Union[DatasetColumn, Dict[DisplayName, DatasetColumn]]:
raise NotImplementedError()

Expand All @@ -432,7 +424,7 @@ def validate_input(self, data_definition: DataDefinition) -> None:
for column in input_columns:
if column not in all_columns:
raise ValueError(
f"Column '{column}' is not found in dataset. Available columns: [{', '.join(all_columns)}]"
f"Column '{column}' is not found in dataset. Available columns: [{', '.join(all_columns)}]",
)

def list_output_columns(self) -> List[str]: # todo: also types?
Expand Down Expand Up @@ -463,7 +455,11 @@ class ColumnTest(SingleInputDescriptor):
condition: ColumnCondition

def __init__(
self, column: str, condition: Union[ColumnCondition, GenericTest], alias: Optional[str] = None, **data: Any
self,
column: str,
condition: Union[ColumnCondition, GenericTest],
alias: Optional[str] = None,
**data: Any,
) -> None:
self.column = column
if isinstance(condition, dict):
Expand All @@ -475,7 +471,9 @@ def __init__(
super().__init__(alias=alias or descriptor_condition.get_default_alias(column), **data)

def generate_data(
self, dataset: "Dataset", options: Options
self,
dataset: "Dataset",
options: Options,
) -> Union[DatasetColumn, Dict[DisplayName, DatasetColumn]]:
data = dataset.column(self.column)
res = data.data.apply(self.condition.check)
Expand Down Expand Up @@ -554,7 +552,9 @@ def __init__(
super().__init__(alias=alias or "summary", **data)

def generate_data(
self, dataset: "Dataset", options: Options
self,
dataset: "Dataset",
options: Options,
) -> Union[DatasetColumn, Dict[DisplayName, DatasetColumn]]:
tests = dataset.data_definition.test_descriptors or []
if len(tests) == 0:
Expand Down Expand Up @@ -621,10 +621,20 @@ class FeatureDescriptor(Descriptor):
feature: GeneratedFeatures

def __init__(
self, feature: GeneratedFeatures, alias: Optional[str] = None, tests: Optional[List[AnyDescriptorTest]] = None
self,
feature: GeneratedFeatures,
alias: Optional[str] = None,
tests: Optional[List[AnyDescriptorTest]] = None,
):
# this is needed because we try to access it before super call
feature = feature if isinstance(feature, GeneratedFeatures) else parse_obj_as(GeneratedFeatures, feature) # type: ignore[type-abstract]
feature = (
feature
if isinstance(feature, GeneratedFeatures)
else parse_obj_as(
GeneratedFeatures,
feature,
)
) # type: ignore[type-abstract]
feature_columns = feature.list_columns()
super().__init__(feature=feature, alias=alias or f"{feature_columns[0].display_name}", tests=tests)

Expand All @@ -636,7 +646,9 @@ def get_dataset_column(self, column_name: str, values: pd.Series) -> DatasetColu
return dataset_column

def generate_data(
self, dataset: "Dataset", options: Options
self,
dataset: "Dataset",
options: Options,
) -> Union[DatasetColumn, Dict[DisplayName, DatasetColumn]]:
feature = self.feature.generate_features_renamed(
dataset.as_dataframe(),
Expand All @@ -648,8 +660,8 @@ def generate_data(
for col in self.feature.list_columns()
}

def list_output_columns(self) -> List[str]:
return [c.display_name for c in self.feature.list_columns()]
def list_output_columns(self) -> List[Tuple[str, ColumnType]]:
return [(c.display_name, self.feature.get_type(c.name)) for c in self.feature.list_columns()]


def _determine_descriptor_column_name(alias: str, columns: List[str]):
Expand Down
Empty file added tests/future/core/__init__.py
Empty file.
Loading
Loading