|
5 | 5 | import quapy as qp |
6 | 6 | import quapy.functional as F |
7 | 7 | from quapy.method import _bayesian |
8 | | -from quapy.method.aggregative import AggregativeCrispQuantifier |
9 | 8 | from quapy.data import LabelledCollection |
10 | | -from quapy.method.aggregative import AggregativeQuantifier |
| 9 | +from quapy.method.aggregative import AggregativeQuantifier, AggregativeCrispQuantifier, AggregativeSoftQuantifier, BinaryAggregativeQuantifier |
11 | 10 | from scipy.stats import chi2 |
12 | 11 | from sklearn.utils import resample |
13 | 12 | from abc import ABC, abstractmethod |
@@ -587,8 +586,113 @@ def aggregate(self, classif_predictions): |
587 | 586 | return np.asarray(samples.mean(axis=0), dtype=float) |
588 | 587 |
|
def predict_conf(self, instances, confidence_level=None) -> (np.ndarray, ConfidenceRegionABC):
    """
    Return a point estimate of class prevalence together with a confidence region around it.

    :param instances: the instances to quantify
    :param confidence_level: the confidence level for the region (default None, meaning that
        the value specified at construction time is used)
    :return: a tuple with the point estimate (np.ndarray) and the confidence region
        (an instance of ConfidenceRegionABC)
    """
    if confidence_level is None:
        confidence_level = self.confidence_level
    posteriors = self.classify(instances)
    prev_estim = self.aggregate(posteriors)
    # the prevalence samples become available as a side effect of calling aggregate()
    prev_samples = self.get_prevalence_samples()
    conf_region = WithConfidenceABC.construct_region(
        prev_samples, confidence_level=confidence_level, method=self.region
    )
    return prev_estim, conf_region
| 596 | + |
| 597 | + |
class PQ(AggregativeSoftQuantifier, BinaryAggregativeQuantifier):
    """
    `Precise Quantifier: Bayesian distribution matching quantifier <https://arxiv.org/abs/2507.06061>`_,
    which is a variant of :class:`HDy` that calculates the posterior probability distribution
    over the prevalence vectors, rather than providing a point estimate.

    This method relies on extra dependencies, which have to be installed via:
    `$ pip install quapy[bayes]`

    :param classifier: a scikit-learn's BaseEstimator, or None, in which case the classifier is taken to be
        the one indicated in `qp.environ['DEFAULT_CLS']`
    :param fit_classifier: whether to train the classifier (default True) or to assume it has
        already been trained
    :param val_split: specifies the data used for generating classifier predictions. This specification
        can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to
        be extracted from the training set; or as an integer (default 5), indicating that the predictions
        are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value
        for `k`); or as a tuple `(X,y)` defining the specific set of data to use for validation. Set to
        None when the method does not require any validation data, in order to avoid that some portion of
        the training data be wasted.
    :param n_bins: number of bins of the histograms of posterior probabilities (default 4)
    :param fixed_bins: if True, bins are equally spaced in [0,1]; if False (default), bin limits
        are set to the quantiles of the validation posterior probabilities
    :param num_warmup: number of warmup iterations for the STAN sampler (default 500)
    :param num_samples: number of samples to draw from the posterior (default 1000)
    :param stan_seed: random seed for the STAN sampler (default 0)
    :param confidence_level: float in (0, 1), the default confidence level for the regions (default 0.95)
    :param region: string, set to `intervals` for constructing confidence intervals (default), or to
        `ellipse` for constructing an ellipse in the probability simplex, or to `ellipse-clr` for
        constructing an ellipse in the Centered-Log Ratio (CLR) unconstrained space.
    """
    def __init__(self,
                 classifier: BaseEstimator=None,
                 fit_classifier=True,
                 val_split: int = 5,
                 n_bins: int = 4,
                 fixed_bins: bool = False,
                 num_warmup: int = 500,
                 num_samples: int = 1_000,
                 stan_seed: int = 0,
                 confidence_level: float = 0.95,
                 region: str = 'intervals'):

        if n_bins <= 0:
            raise ValueError(f'parameter {n_bins=} must be a positive integer')
        if num_warmup <= 0:
            raise ValueError(f'parameter {num_warmup=} must be a positive integer')
        if num_samples <= 0:
            raise ValueError(f'parameter {num_samples=} must be a positive integer')

        if not _bayesian.DEPENDENCIES_INSTALLED:
            raise ImportError("Auxiliary dependencies are required. "
                              "Run `$ pip install quapy[bayes]` to install them.")

        super().__init__(classifier, fit_classifier, val_split)
        self.n_bins = n_bins
        self.fixed_bins = fixed_bins
        self.num_warmup = num_warmup
        self.num_samples = num_samples
        self.stan_seed = stan_seed
        self.stan_code = _bayesian.load_stan_file()
        self.confidence_level = confidence_level
        self.region = region

    def _bin_counts(self, posteriors):
        """
        Assign posterior probabilities to bins and return the per-bin counts.

        Uses only the interior bin limits, so every value is assigned to some bin
        (values below the first limit go to bin 0, values above the last go to the
        last bin); the same scheme is used at fit and at aggregation time so that
        training and test histograms are computed consistently.

        :param posteriors: array of posterior probabilities for the positive class
        :return: integer array of length `self.n_bins` with the count per bin
        """
        bin_indices = np.digitize(posteriors, self.bin_limits[1:-1], right=True)
        return np.bincount(bin_indices, minlength=self.n_bins)

    def aggregation_fit(self, classif_predictions, labels):
        """
        Compute the bin limits and the per-class histograms of validation posteriors.

        :param classif_predictions: array of shape (n_instances, n_classes) with posterior probabilities
        :param labels: array of true labels for the validation instances
        """
        y_pred = classif_predictions[:, self.pos_label]

        # Compute bin limits
        if self.fixed_bins:
            # Uniform bins in [0,1]
            self.bin_limits = np.linspace(0, 1, self.n_bins + 1)
        else:
            # Quantile bins (computed on the validation posteriors)
            self.bin_limits = np.quantile(y_pred, np.linspace(0, 1, self.n_bins + 1))

        # Positive and negative masks
        pos_mask = (labels == self.pos_label)
        neg_mask = ~pos_mask

        # Count positives and negatives per bin
        self.pos_hist = self._bin_counts(y_pred[pos_mask])
        self.neg_hist = self._bin_counts(y_pred[neg_mask])

    def aggregate(self, classif_predictions):
        """
        Return the mean of the posterior distribution over prevalence values, as computed
        by the STAN sampler; the full set of samples is kept in `self.prev_distribution`.

        :param classif_predictions: array of shape (n_instances, n_classes) with posterior probabilities
        :return: array of shape (2,) with the estimated prevalence of the negative and positive class
        """
        Px_test = classif_predictions[:, self.pos_label]
        # bin the test posteriors with the same scheme used at fit time; with quantile
        # bins, np.histogram would silently drop test values outside the training range
        test_hist = self._bin_counts(Px_test)
        prevs = _bayesian.pq_stan(
            self.stan_code, self.n_bins, self.pos_hist, self.neg_hist, test_hist,
            self.num_samples, self.num_warmup, self.stan_seed
        ).flatten()
        self.prev_distribution = np.vstack([1-prevs, prevs]).T
        return self.prev_distribution.mean(axis=0)

    def aggregate_conf(self, predictions, confidence_level=None):
        """
        Return a point estimate of class prevalence together with a confidence region,
        computed from pre-classified instances.

        :param predictions: array of shape (n_instances, n_classes) with posterior probabilities
        :param confidence_level: the confidence level for the region (default None, meaning that
            the value specified at construction time is used)
        :return: a tuple with the point estimate (np.ndarray) and the confidence region
        """
        if confidence_level is None:
            confidence_level = self.confidence_level
        point_estimate = self.aggregate(predictions)
        samples = self.prev_distribution  # available after calling "aggregate"
        region = WithConfidenceABC.construct_region(samples, confidence_level=confidence_level, method=self.region)
        return point_estimate, region

    def predict_conf(self, instances, confidence_level=None) -> (np.ndarray, ConfidenceRegionABC):
        """
        Return a point estimate of class prevalence together with a confidence region around it.

        :param instances: the instances to quantify
        :param confidence_level: the confidence level for the region (default None, meaning that
            the value specified at construction time is used)
        :return: a tuple with the point estimate (np.ndarray) and the confidence region
        """
        predictions = self.classify(instances)
        return self.aggregate_conf(predictions, confidence_level=confidence_level)
| 697 | + |
| 698 | + |
0 commit comments