Skip to content

Multi-Label

Routines for multi-label aggregation.

BinaryRelevance

Bases: BaseClassificationAggregator

Simple aggregation algorithm for multi-label classification.

Binary Relevance is a straightforward approach to multi-label classification aggregation: each label is treated as a class in a separate binary classification problem and is aggregated independently using an aggregation algorithm for classification, e.g., Majority Vote or Dawid-Skene.

{% note info %}

If this method is used for single-label classification, the output of BinaryRelevance may differ from the output of the base aggregator applied directly to the same data, because each class generates its own binary classification task and is therefore aggregated separately. For example, some tasks may end up with no labels at all.

{% endnote %}

Examples:

>>> import pandas as pd
>>> from crowdkit.aggregation import BinaryRelevance, DawidSkene
>>> df = pd.DataFrame(
>>>     [
>>>         ['t1', 'w1', ['house', 'tree']],
>>>         ['t1', 'w2', ['house']],
>>>         ['t1', 'w3', ['house', 'tree', 'grass']],
>>>         ['t2', 'w1', ['car']],
>>>         ['t2', 'w2', ['car', 'human']],
>>>         ['t2', 'w3', ['train']]
>>>     ]
>>> )
>>> df.columns = ['task', 'worker', 'label']
>>> result = BinaryRelevance(DawidSkene(n_iter=10)).fit_predict(df)
Source code in crowdkit/aggregation/multilabel/binary_relevance.py
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
@attr.s
class BinaryRelevance(BaseClassificationAggregator):
    r"""Simple aggregation algorithm for multi-label classification.

    Binary Relevance is a straightforward approach to multi-label classification aggregation:
    each label is treated as a class in a separate binary classification problem and is aggregated
    independently using an aggregation algorithm for classification, e.g., Majority Vote or Dawid-Skene.

    {% note info %}

    If this method is used for single-label classification, the output of BinaryRelevance may differ
    from the output of the base aggregator applied directly to the same data, because each class generates
    its own binary classification task and is therefore aggregated separately. For example, some tasks
    may end up with no labels at all.

    {% endnote %}

    Examples:
        >>> import pandas as pd
        >>> from crowdkit.aggregation import BinaryRelevance, DawidSkene
        >>> df = pd.DataFrame(
        ...     [
        ...         ['t1', 'w1', ['house', 'tree']],
        ...         ['t1', 'w2', ['house']],
        ...         ['t1', 'w3', ['house', 'tree', 'grass']],
        ...         ['t2', 'w1', ['car']],
        ...         ['t2', 'w2', ['car', 'human']],
        ...         ['t2', 'w3', ['train']]
        ...     ]
        ... )
        >>> df.columns = ['task', 'worker', 'label']
        >>> result = BinaryRelevance(DawidSkene(n_iter=10)).fit_predict(df)
    """

    # NOTE(review): the default instance is created once and shared by every
    # BinaryRelevance that does not pass its own aggregator. `fit` only ever
    # passes it to `clone_aggregator` and fits the result, never the default
    # itself; this presumes `clone_aggregator` returns an independent copy.
    base_aggregator: BaseClassificationAggregator = attr.ib(default=MajorityVote())
    """Aggregator instance that will be used for each binary classification.
    All class parameters will be copied, except for the results of previous fit."""

    # Class-level placeholder only: `fit` binds a fresh dictionary on the
    # instance, so this shared object is never mutated.
    aggregators_: Dict[str, BaseClassificationAggregator] = dict()
    """Label aggregators matched to classes. A dictionary that matches aggregators to classes.
    The key is the class found in the source data, and the value is the aggregator used for this class.
    The set of keys is all the classes that are in the input data."""

    @base_aggregator.validator
    def _any_name_except_a_name_of_an_attribute(
        self, attribute: Any, value: Any
    ) -> None:
        """Check that `base_aggregator` is a classification aggregator.

        Raises:
            AssertionError: If `value` is not an instance of
                `BaseClassificationAggregator`.
        """
        assert isinstance(
            value, BaseClassificationAggregator
        ), "Aggregator argument should be a classification aggregator"

    def fit(self, data: pd.DataFrame) -> "BinaryRelevance":
        """Fit one binary aggregator per label and collect the aggregated label lists.

        Every call rebuilds `aggregators_` from scratch, so it only ever contains
        the labels seen in the most recent `data` and is never shared between
        different `BinaryRelevance` instances.

        Args:
            data (DataFrame): Workers' labeling results.
                A pandas.DataFrame containing `task`, `worker` and `label` columns.
                The `label` column should contain a list of labels for each row,
                e.g. ['tree', 'house', 'car'].

        Returns:
            BinaryRelevance: self.
        """

        data = data[["task", "worker", "label"]]
        mlb = MultiLabelBinarizer()
        # One 0/1 indicator column per distinct label, ordered like `mlb.classes_`.
        binarized_labels = mlb.fit_transform(data["label"])
        task_to_labels: Dict[Union[str, float], List[Union[str, float]]] = dict()
        # Collected locally and bound to the instance once fitting has finished,
        # instead of mutating the dictionary stored on the class.
        fitted_aggregators: Dict[str, BaseClassificationAggregator] = dict()

        for i, label in enumerate(mlb.classes_):
            # `assign` builds a new frame, so the caller's data is never written
            # through a slice (avoids pandas' SettingWithCopyWarning).
            single_label_df = data[["task", "worker"]].assign(
                label=binarized_labels[:, i]
            )

            label_aggregator = clone_aggregator(self.base_aggregator)
            label_aggregator.fit_predict(single_label_df)
            fitted_aggregators[label] = label_aggregator
            if label_aggregator.labels_ is not None:  # for mypy correct work
                for task, label_value in label_aggregator.labels_.items():
                    # Register every task, even when this label is rejected, so
                    # that tasks with no accepted labels map to an empty list.
                    task_labels = task_to_labels.setdefault(
                        cast(Union[str, float], task), list()
                    )
                    if label_value:
                        task_labels.append(label)

        self.aggregators_ = fitted_aggregators
        if not task_to_labels:
            # An explicit dtype is passed when building the Series from an empty dict.
            self.labels_ = pd.Series(task_to_labels, dtype=float)
        else:
            self.labels_ = pd.Series(task_to_labels)
        if len(self.labels_):
            self.labels_.index.name = "task"
        return self

    def fit_predict(self, data: pd.DataFrame) -> "pd.Series[Any]":
        """Fit the model and return aggregated results.

        Args:
            data (DataFrame): Workers' labeling results.
                A pandas.DataFrame containing `task`, `worker` and `label` columns.

        Returns:
            Series: Tasks' labels.
                A pandas.Series indexed by `task` such that `labels.loc[task]`
                is a list with the task's aggregated labels.

        Raises:
            AssertionError: If `labels_` is still `None` after fitting.
        """
        self.fit(data)
        assert self.labels_ is not None, "no labels_ produced"
        return self.labels_

aggregators_: Dict[str, BaseClassificationAggregator] = dict() class-attribute instance-attribute

Label aggregators matched to classes. A dictionary that matches aggregators to classes. The key is the class found in the source data, and the value is the aggregator used for this class. The set of keys is all the classes that are in the input data.

base_aggregator: BaseClassificationAggregator = attr.ib(default=MajorityVote()) class-attribute instance-attribute

Aggregator instance that will be used for each binary classification. All class parameters will be copied, except for the results of previous fit.

fit(data)

Fit the aggregators.

Parameters:

Name Type Description Default
data DataFrame

Workers' labeling results. A pandas.DataFrame containing task, worker and label columns. The label column should contain a list of labels for each row, e.g. ['tree', 'house', 'car'].

required

Returns:

Name Type Description
BinaryRelevance BinaryRelevance

self.

Source code in crowdkit/aggregation/multilabel/binary_relevance.py
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
def fit(self, data: pd.DataFrame) -> "BinaryRelevance":
    """Fit one binary aggregator per label and collect the aggregated label lists.

    Every call rebuilds `aggregators_` from scratch, so it only ever contains
    the labels seen in the most recent `data` and is never shared between
    different instances.

    Args:
        data (DataFrame): Workers' labeling results.
            A pandas.DataFrame containing `task`, `worker` and `label` columns.
            The `label` column should contain a list of labels for each row,
            e.g. ['tree', 'house', 'car'].

    Returns:
        BinaryRelevance: self.
    """

    data = data[["task", "worker", "label"]]
    mlb = MultiLabelBinarizer()
    # One 0/1 indicator column per distinct label, ordered like `mlb.classes_`.
    binarized_labels = mlb.fit_transform(data["label"])
    task_to_labels: Dict[Union[str, float], List[Union[str, float]]] = dict()
    # Collected locally and bound to the instance once fitting has finished,
    # instead of mutating a dictionary that may be shared at class level.
    fitted_aggregators: Dict[str, BaseClassificationAggregator] = dict()

    for i, label in enumerate(mlb.classes_):
        # `assign` builds a new frame, so the caller's data is never written
        # through a slice (avoids pandas' SettingWithCopyWarning).
        single_label_df = data[["task", "worker"]].assign(
            label=binarized_labels[:, i]
        )

        label_aggregator = clone_aggregator(self.base_aggregator)
        label_aggregator.fit_predict(single_label_df)
        fitted_aggregators[label] = label_aggregator
        if label_aggregator.labels_ is not None:  # for mypy correct work
            for task, label_value in label_aggregator.labels_.items():
                # Register every task, even when this label is rejected, so
                # that tasks with no accepted labels map to an empty list.
                task_labels = task_to_labels.setdefault(
                    cast(Union[str, float], task), list()
                )
                if label_value:
                    task_labels.append(label)

    self.aggregators_ = fitted_aggregators
    if not task_to_labels:
        # An explicit dtype is passed when building the Series from an empty dict.
        self.labels_ = pd.Series(task_to_labels, dtype=float)
    else:
        self.labels_ = pd.Series(task_to_labels)
    if len(self.labels_):
        self.labels_.index.name = "task"
    return self

fit_predict(data)

Fit the model and return aggregated results.

Parameters:

Name Type Description Default
data DataFrame

Workers' labeling results. A pandas.DataFrame containing task, worker and label columns.

required

Returns:

Name Type Description
Series Series[Any]

Tasks' labels. A pandas.Series indexed by task such that labels.loc[task] is a list with the task's aggregated labels.

Source code in crowdkit/aggregation/multilabel/binary_relevance.py
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
def fit_predict(self, data: pd.DataFrame) -> "pd.Series[Any]":
    """Run `fit` on the given data and hand back the aggregated labels.

    Args:
        data (DataFrame): Workers' labeling results.
            A pandas.DataFrame containing `task`, `worker` and `label` columns.

    Returns:
        Series: Tasks' labels.
            A pandas.Series indexed by `task` such that `labels.loc[task]`
            is a list with the task's aggregated labels.

    Raises:
        AssertionError: If `labels_` is still `None` after fitting.
    """
    self.fit(data)
    aggregated = self.labels_
    # Guard that also narrows the Optional type for static checkers.
    assert aggregated is not None, "no labels_ produced"
    return aggregated