Update ref-metrics.py

ref-metrics.py CHANGED (+414 -96)

@@ -11,59 +11,153 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+"""TODO: Add a description here."""

-import random
-import datetime
-import os
+from typing import List, Literal, Tuple

 import datasets
 import evaluate
-import
+import numpy as np
+from deprecated import deprecated
+from seametrics.detection import PrecisionRecallF1Support
+from seametrics.detection.utils import payload_to_det_metric
+from seametrics.payload import Payload

 _CITATION = """\
-@InProceedings{
-    title = {
-    authors={
+@InProceedings{coco:2020,
+    title = {Microsoft {COCO:} Common Objects in Context},
+    authors={Tsung{-}Yi Lin and
+        Michael Maire and
+        Serge J. Belongie and
+        James Hays and
+        Pietro Perona and
+        Deva Ramanan and
+        Piotr Dollar and
+        C. Lawrence Zitnick},
+    booktitle = {Computer Vision - {ECCV} 2014 - 13th European Conference, Zurich,
+        Switzerland, September 6-12, 2014, Proceedings, Part {V}},
+    series = {Lecture Notes in Computer Science},
+    volume = {8693},
+    pages = {740--755},
+    publisher = {Springer},
+    year={2014}
 }
 """

 _DESCRIPTION = """\
+This evaluation metric is designed to give provide object detection metrics at
+different object size levels. It is based on a modified version of the commonly used
+COCO-evaluation metrics.
+"""


 _KWARGS_DESCRIPTION = """
+Calculates object detection metrics given predicted and ground truth bounding boxes for
+a single image.
 Args:
-    predictions: list of predictions
+    predictions: list of predictions for each image. Each prediction should
+        be a dict containing the following
+        - 'boxes': list of bounding boxes, xywh in absolute pixel values
+        - 'labels': list of labels for each bounding box
+        - 'scores': list of scores for each bounding box
+    references: list of ground truth annotations for each image. Each reference should
+        be a dict containing the following
+        - 'boxes': list of bounding boxes, xywh in absolute pixel values
+        - 'labels': list of labels for each bounding box
+        - 'area': list of areas for each bounding box
+Returns:
+    dict containing dicts for each specified area range with following items:
+        'range': specified area with [max_px_area, max_px_area]
+        'iouThr': min. IOU-threshold of a prediction with a ground truth box
+            to be considered a correct prediction
+        'maxDets': maximum number of detections
+        'tp': number of true positive (correct) predictions
+        'fp': number of false positive (incorrect) predictions
+        'fn': number of false negative (missed) predictions
+        'duplicates': number of duplicate predictions
+        'precision': best possible score = 1, worst possible score = 0
+            large if few false positive predictions
+            formula: tp/(fp+tp)
+        'recall' best possible score = 1, worst possible score = 0
+            large if few missed predictions
+            formula: tp/(tp+fn)
+        'f1': best possible score = 1, worst possible score = 0
+            trades off precision and recall
+            formula: 2*(precision*recall)/(precision+recall)
+        'support': number of ground truth bounding boxes considered in the evaluation,
+        'fpi': number of images with no ground truth but false positive predictions,
+        'nImgs': number of images considered in evaluation
+Examples:
+    >>> import evaluate
+    >>> from seametrics.payload.processor import PayloadProcessor
+    >>> payload = PayloadProcessor(...).payload
+    >>> module = evaluate.load("SEA-AI/det-metrics", ...)
+    >>> module._add_payload(payload)
+    >>> result = module.compute()
+    >>> print(result)
+    {'all': {
+        'range': [0, 10000000000.0],
+        'iouThr': '0.00',
+        'maxDets': 100,
+        'tp': 1,
+        'fp': 3,
+        'fn': 1,
+        'duplicates': 0,
+        'precision': 0.25,
+        'recall': 0.5,
+        'f1': 0.3333333333333333,
+        'support': 2,
+        'fpi': 0,
+        'nImgs': 2
+        }
+    }
 """


 @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
-class
+class DetectionMetric(evaluate.Metric):
+    def __init__(
+        self,
+        area_ranges_tuples: List[Tuple[str, List[int]]] = [("all", [0, 1e5**2])],
+        iou_threshold: List[float] = [1e-10],
+        class_agnostic: bool = True,
+        bbox_format: str = "xywh",
+        iou_type: Literal["bbox", "segm"] = "bbox",
+        payload: Payload = None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        # save parameters for later
+        self.payload = payload
+        self.model_names = payload.models if payload else ["custom"]
+        self.iou_threshold = iou_threshold
+        self.area_ranges_tuples = area_ranges_tuples
+        self.class_agnostic = class_agnostic
+        self.iou_type = iou_type
+        self.bbox_format = bbox_format
+
+        # postprocess parameters
+        self.iou_thresholds = (
+            iou_threshold if isinstance(iou_threshold, list) else [iou_threshold]
+        )
+        self.area_ranges = [v for _, v in area_ranges_tuples]
+        self.area_ranges_labels = [k for k, _ in area_ranges_tuples]
+
+        # initialize coco_metrics
+        self.coco_metric = PrecisionRecallF1Support(
+            iou_thresholds=self.iou_thresholds,
+            area_ranges=self.area_ranges,
+            area_ranges_labels=self.area_ranges_labels,
+            class_agnostic=self.class_agnostic,
+            iou_type=self.iou_type,
+            box_format=self.bbox_format,
+        )
+
+        # initialize evaluation metric
+        self._init_evaluation_metric()

     def _info(self):
-        # TODO: Specifies the evaluate.EvaluationModuleInfo object
         return evaluate.MetricInfo(
             # This is the description that will appear on the modules page.
             module_type="metric",
@@ -73,36 +167,265 @@ class UserFriendlyMetrics(evaluate.Metric):
             # This defines the format of each prediction and reference
             features=datasets.Features(
                 {
-                    "predictions":
-                        datasets.
+                    "predictions": [
+                        datasets.Features(
+                            {
+                                "boxes": datasets.Sequence(
+                                    datasets.Sequence(datasets.Value("float"))
+                                ),
+                                "labels": datasets.Sequence(datasets.Value("int64")),
+                                "scores": datasets.Sequence(datasets.Value("float")),
+                            }
+                        )
+                    ],
+                    "references": [
+                        datasets.Features(
+                            {
+                                "boxes": datasets.Sequence(
+                                    datasets.Sequence(datasets.Value("float"))
+                                ),
+                                "labels": datasets.Sequence(datasets.Value("int64")),
+                                "area": datasets.Sequence(datasets.Value("float")),
+                            }
+                        )
+                    ],
                 }
             ),
             # Additional links to the codebase or references
-            codebase_urls=[
+            codebase_urls=[
+                "https://github.com/SEA-AI/seametrics/tree/main",
+                "https://lightning.ai/docs/torchmetrics/stable/detection/mean_average_precision.html",
+            ],
         )

-    def
-        """
-        #
-        self,
+    def add(self, *, prediction, reference, **kwargs):
+        """Adds a batch of predictions and references to the metric"""
+        # in case the inputs are lists, convert them to numpy arrays
+        prediction = self._preprocess(prediction)
+        reference = self._preprocess(reference)
+
+        self.coco_metric.update(prediction, reference)
+
+    def _init_evaluation_metric(self, **kwargs):
+        """
+        Initializes the evaluation metric by generating sample data, preprocessing predictions and references,
+        and then adding the processed data to the metric using the super class method with additional keyword arguments.
+
+        Parameters:
+            **kwargs: Additional keyword arguments for the super class method.
+
+        Returns:
+            None
+        """
+        predictions, references = self._generate_sample_data()
+        predictions = self._preprocess(predictions)
+        references = self._preprocess(references)
+
+        # does not impact the metric, but is required for the interface x_x
+        super(evaluate.Metric, self).add(
+            prediction=self._postprocess(predictions),
+            references=self._postprocess(references),
+            **kwargs,
+        )
+
+    @deprecated(reason="Use `module._add_payload` instead")
+    def add_batch(self, payload: Payload, model_name: str = None):
+        """Takes as input a payload and adds the batch to the metric"""
+        self._add_payload(payload, model_name)
+
+    def _compute(self, *, predictions, references, **kwargs):
+        """Called within the evaluate.Metric.compute() method"""
+
+        results = {}
+        for model_name in self.model_names:
+            print(f"\n##### {model_name} #####")
+            # add payload if available (otherwise predictions and references must be added with add function)
+            if self.payload:
+                self._add_payload(self.payload, model_name)
+
+            results[model_name] = self.coco_metric.compute()
+
+            # reset coco_metrics for next model
+            self.coco_metric = PrecisionRecallF1Support(
+                iou_thresholds=self.iou_thresholds,
+                area_ranges=self.area_ranges,
+                area_ranges_labels=self.area_ranges_labels,
+                class_agnostic=self.class_agnostic,
+                iou_type=self.iou_type,
+                box_format=self.bbox_format,
+            )
+        return results
+
+    def _add_payload(self, payload: Payload, model_name: str = None):
+        """Converts the payload to the format expected by the metric"""
+        # import only if needed since fiftyone is not a direct dependency
+
+        predictions, references = payload_to_det_metric(payload, model_name)
+        self.add(prediction=predictions, reference=references)
+
+        return self
+
+    def _preprocess(self, list_of_dicts):
+        """Converts the lists to numpy arrays for type checking"""
+        return [self._lists_to_np(d) for d in list_of_dicts]
+
+    def _postprocess(self, list_of_dicts):
+        """Converts the numpy arrays to lists for type checking"""
+        return [self._np_to_lists(d) for d in list_of_dicts]
+
+    def _np_to_lists(self, d):
+        """datasets does not support numpy arrays for type checking"""
+        for k, v in d.items():
+            if isinstance(v, dict):
+                self._np_to_lists(v)
+            elif isinstance(v, np.ndarray):
+                d[k] = v.tolist()
+        return d
+
+    def _lists_to_np(self, d):
+        """datasets does not support numpy arrays for type checking"""
+        for k, v in d.items():
+            if isinstance(v, dict):
+                self._lists_to_np(v)
+            elif isinstance(v, list):
+                d[k] = np.array(v)
+        return d
+
+    def generate_confidence_curves(
+        self, results, confidence_config={"T": 0, "R": 0, "K": 0, "A": 0, "M": 0}
     ):
         """
+        Generate confidence curves based on results and confidence configuration.
+
+        Parameters:
+            results (dict): Results of the evaluation for different models.
+            confidence_config (dict): Configuration for confidence values. Defaults to {"T": 0, "R": 0, "K": 0, "A": 0, "M": 0}.
+                T: [1e-10] iou threshold
+                R: recall threshold (not used)
+                K: class index (class-agnostic mAP, so only 0)
+                A: 0=all, 1=small, 2=medium, 3=large, ... (depending on area ranges)
+                M: [100] maxDets default in precision_recall_f1_support
+
+        Returns:
+            fig (plotly.graph_objects.Figure): The plotly figure showing the confidence curves.
+        """
+        import plotly.graph_objects as go
+        from seametrics.detection.utils import get_confidence_metric_vals
+
+        # Create traces
+        fig = go.Figure()
+        metrics = ["precision", "recall", "f1"]
+        for model_name in self.model_names:
+            print(f"##### {model_name} #####")
+            plot_data = get_confidence_metric_vals(
+                cocoeval=results[model_name]["eval"],
+                T=confidence_config["T"],
+                R=confidence_config["R"],
+                K=confidence_config["K"],
+                A=confidence_config["A"],
+                M=confidence_config["M"],
+            )
+
+            for metric in metrics:
+                fig.add_trace(
+                    go.Scatter(
+                        x=plot_data["conf"],
+                        y=plot_data[metric],
+                        mode="lines",
+                        name=f"{model_name} {metric}",
+                        line=dict(dash=None if metric == "f1" else "dash"),
+                    )
+                )
+
+        fig.update_layout(
+            title="Metric vs Confidence",
+            hovermode="x unified",
+            xaxis_title="Confidence",
+            yaxis_title="Metric value",
+        )
+        return fig

+    def wandb(self, results , wandb_runs: list = None, wandb_section: str = None, wandb_project='detection_metrics'):
+        """
+        Logs metrics to Weights and Biases (wandb) for tracking and visualization.
+
+        This function logs the provided metrics to Weights and Biases (wandb), a platform for tracking machine learning experiments.
+        Each key in the `results` dictionary represents a separate run and the corresponding value contains the metrics for that run.
+        If a W&B run list is provided, the results of the runs will be added to the passed W&B runs. Otherwise new W&B runs will be created.
+        If a W&B section ist provided, the metrics will be logged in this section drop-down. Otherwise no extra W&B section is created
+        and the metrics are logged directly.
+        The function logs in to wandb using an API key obtained from the secret 'WANDB_API_KEY', initializes a run for
+        each key in `results` and logs the metrics.
+
+        Args:
+            results (dict): A dictionary where each key is a unique identifier for a run and each value is another dictionary
+                containing the metrics to log. Example:
+                {
+                    "run1": {"metrics": {"accuracy": 0.9, "loss": 0.1}},
+                    "run2": {"metrics": {"accuracy": 0.85, "loss": 0.15}}
+                }
+            wandb_runs (list, optional): A list containing W&B runs where the results should be added
+                (e.g. the first item in results will be added to the first run in wandb_runs, etc.)
+            wandb_section (str, optional): A string to specify the W&B
+            wandb_project (str, optional): The name of the wandb project to which the runs will be logged. Defaults to 'detection_metrics'.
+
+        Environment Variables:
+            WANDB_API_KEY: The API key for authenticating with wandb.
+
+        Imports:
+            os: To retrieve environment variables.
+            wandb: To interact with the Weights and Biases platform.
+            datetime: To generate a timestamp for run names.
+        """
+        import os
+        import wandb
+        import datetime
+
+        current_datetime = datetime.datetime.now()
+        formatted_datetime = current_datetime.strftime("%Y-%m-%d_%H-%M-%S")
+        wandb.login(key=os.getenv('WANDB_API_KEY'))
+
+        if not wandb_runs is None:
+            assert len(wandb_runs) == len(results), "runs and results must have the same length"
+
+        for i, k in enumerate(results.keys()):
+            if wandb_runs is None:
+                run = wandb.init(project=wandb_project, name=f"{k}-{formatted_datetime}")
+            else:
+                run = wandb_runs[i]
+            run.log({f"{wandb_section}/{m}" : v for m, v in results[k]['metrics'].items()} if wandb_section is not None else results[k]['metrics'])
+            if wandb_runs is None:
+                run.finish()
+
+    def _generate_sample_data(self):
+        """
+        Generates dummy sample data for predictions and references used for initialization.
+
+        Returns:
+            Tuple[List[Dict[str, List[Union[float, int]]]], List[Dict[str, List[Union[float, int]]]]]:
+                - predictions (List[Dict[str, List[Union[float, int]]]]): A list of dictionaries representing the predictions. Each dictionary contains the following keys:
+                    - boxes (List[List[float]]): A list of bounding boxes in the format [x, y, w, h].
+                    - labels (List[int]): A list of labels.
+                    - scores (List[float]): A list of scores.
+                - references (List[Dict[str, List[Union[float, int]]]]): A list of dictionaries representing the references. Each dictionary contains the following keys:
+                    - boxes (List[List[float]]): A list of bounding boxes in the format [x, y, w, h].
+                    - labels (List[int]): A list of labels.
+                    - area (List[float]): A list of areas.
+        """
+        predictions = [
+            {"boxes": [[1.0, 2.0, 3.0, 4.0]], "labels": [0], "scores": [1.0]}
+        ]
+        references = [{"boxes": [[1.0, 2.0, 3.0, 4.0]], "labels": [0], "area": [1.0]}]
+
+        return predictions, references
+
+
+    def compute_from_payload(self, payload: Payload):
+        """
+        Compute the metric from the payload.
         Args:
             payload (Payload): The payload to compute the metric from.
             **kwargs: Additional keyword arguments.
         Returns:
             dict: The computed metric results with the following format:
             {
@@ -126,49 +449,44 @@ class UserFriendlyMetrics(evaluate.Metric):
         - If the metric does not support area ranges, the metric should store the results under the `all` key.
        - If a range area is provided it will be displayed in the output. if area_ranges_tuples is None, then all the area ranges will be displayed
         """
-    def dummy_values(self, area_ranges_tuples=None):
-        """Dummy randome values in the expected format that all new metrics need to return"""
-        # Use default ranges if none are provided
-        if area_ranges_tuples is None:
-            area_names = ["all", "small", "medium", "large"]
-        else:
-            area_names = {
-                key
-                for key, value in area_ranges_tuples.items()
-                if value["range"] is not None
-            }
-        # Generate random dummy values
-        def generate_random_values():
-            return {
-                "tp": random.randint(0, 100),  # Random integer between 0 and 100
-                "fp": random.randint(0, 50),  # Random integer between 0 and 50
-                "fn": random.randint(0, 50),  # Random integer between 0 and 50
-                "precision": round(
-                    random.uniform(0.5, 1.0), 2
-                ),  # Random float between 0.5 and 1.0
-                "recall": round(
-                    random.uniform(0.5, 1.0), 2
-                ),  # Random float between 0.5 and 1.0
-                "f1": round(
-                    random.uniform(0.5, 1.0), 2
-                ),  # Random float between 0.5 and 1.0
-            }
-        # Initialize output structure
-        dummy_output = {"model_1": {"overall": {}, "per_sequence": {"sequence_1": {},"sequence_2": {}}}}
+        results = {}

+        for model_name in payload.models:
+            results[model_name] = {"overall": {}, "per_sequence": {}}
+
+            # per-sequence loop
+            for seq_name, sequence in payload.sequences.items():
+                # create new payload only with specific sequence and model
+                sequence_payload = Payload(
+                    dataset=payload.dataset,
+                    gt_field_name=payload.gt_field_name,
+                    models=[model_name],
+                    sequences={seq_name: sequence}
+                )
+                module = DetectionMetric(
+                    area_ranges_tuples=self.area_ranges_tuples,
+                    iou_threshold=self.iou_threshold,
+                    class_agnostic=self.class_agnostic,
+                    bbox_format=self.bbox_format,
+                    iou_type=self.iou_type,
+                    payload=sequence_payload
+                )
+                results[model_name]["per_sequence"][seq_name] = module.compute()[model_name]["metrics"]

+            # overall per-model loop
+            model_payload = Payload(
+                dataset=payload.dataset,
+                gt_field_name=payload.gt_field_name,
+                models=[model_name],
+                sequences=payload.sequences
+            )
+            module = DetectionMetric(
+                area_ranges_tuples=self.area_ranges_tuples,
+                iou_threshold=self.iou_threshold,
+                class_agnostic=self.class_agnostic,
+                bbox_format=self.bbox_format,
+                iou_type=self.iou_type,
+                payload=model_payload
+            )
+            results[model_name]["overall"] = module.compute()[model_name]["metrics"]
+        return results
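
For context, a minimal usage sketch of the updated module (not part of the commit). It feeds one image's predictions and references directly through `add()` and `compute()` using the feature schema declared in `_info()`; the `SEA-AI/det-metrics` Hub id is taken from the docstring example, and all box and area numbers are made-up illustration values.

```python
# Usage sketch (illustrative only, not from the commit). Assumes the module is
# loadable under the "SEA-AI/det-metrics" id shown in its docstring; loading the
# local ref-metrics.py file with evaluate.load() should behave the same way.
import evaluate

module = evaluate.load(
    "SEA-AI/det-metrics",
    area_ranges_tuples=[("all", [0, 1e5**2]), ("small", [0, 36**2])],
    iou_threshold=[1e-10],
)

# One image; boxes are xywh in absolute pixels, as documented in _KWARGS_DESCRIPTION.
predictions = [{"boxes": [[10.0, 12.0, 20.0, 15.0]], "labels": [0], "scores": [0.9]}]
references = [{"boxes": [[10.0, 10.0, 20.0, 16.0]], "labels": [0], "area": [320.0]}]

module.add(prediction=predictions, reference=references)
results = module.compute()  # keyed by model name ("custom" here, since no Payload was given)
```

When a seametrics `Payload` is available instead, `compute_from_payload(payload)` returns, per model, an `overall` block plus one `per_sequence` block per sequence, as the loop in the new method shows.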