Skip to content

Metrics

Routines for metrics.

Metrics for data.

alpha_krippendorff(answers, distance=binary_distance)

Inter-annotator agreement coefficient (Krippendorff 1980).

Amount that annotators agreed on label assignments beyond what is expected by chance. The value of alpha should be interpreted as follows: alpha >= 0.8 indicates a reliable annotation, alpha >= 0.667 allows making tentative conclusions only, while lower values suggest an unreliable annotation.

Parameters:

Name Type Description Default
answers DataFrame

A data frame containing task, worker and label columns.

required
distance Callable[[Hashable, Hashable], float]

Distance metric that takes two arguments and returns a value between 0.0 and 1.0. By default: binary_distance (0.0 for equal labels, 1.0 otherwise).

binary_distance

Returns:

Type Description
float

Float value.

Examples:

Consistent answers.

>>> alpha_krippendorff(pd.DataFrame.from_records([
>>>     {'task': 'X', 'worker': 'A', 'label': 'Yes'},
>>>     {'task': 'X', 'worker': 'B', 'label': 'Yes'},
>>>     {'task': 'Y', 'worker': 'A', 'label': 'No'},
>>>     {'task': 'Y', 'worker': 'B', 'label': 'No'},
>>> ]))
1.0

Partially inconsistent answers.

>>> alpha_krippendorff(pd.DataFrame.from_records([
>>>     {'task': 'X', 'worker': 'A', 'label': 'Yes'},
>>>     {'task': 'X', 'worker': 'B', 'label': 'Yes'},
>>>     {'task': 'Y', 'worker': 'A', 'label': 'No'},
>>>     {'task': 'Y', 'worker': 'B', 'label': 'No'},
>>>     {'task': 'Z', 'worker': 'A', 'label': 'Yes'},
>>>     {'task': 'Z', 'worker': 'B', 'label': 'No'},
>>> ]))
0.4444444444444444
Source code in crowdkit/metrics/data/_classification.py
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
def alpha_krippendorff(
    answers: pd.DataFrame,
    distance: Callable[[Hashable, Hashable], float] = binary_distance,
) -> float:
    """Krippendorff's alpha inter-annotator agreement coefficient (Krippendorff 1980).

    Measures how much annotators agree on label assignments beyond the
    agreement expected by chance. Interpretation guideline:
        alpha >= 0.8 indicates a reliable annotation,
        alpha >= 0.667 allows making tentative conclusions only,
        while lower values suggest an unreliable annotation.

    Args:
        answers: A data frame containing `task`, `worker` and `label` columns.
        distance: Distance metric that takes two labels and returns a value
            between 0.0 and 1.0.
            By default: binary_distance (0.0 for equal labels 1.0 otherwise).

    Returns:
        Float value.

    Examples:
        Consistent answers.

        >>> alpha_krippendorff(pd.DataFrame.from_records([
        ...     {'task': 'X', 'worker': 'A', 'label': 'Yes'},
        ...     {'task': 'X', 'worker': 'B', 'label': 'Yes'},
        ...     {'task': 'Y', 'worker': 'A', 'label': 'No'},
        ...     {'task': 'Y', 'worker': 'B', 'label': 'No'},
        ... ]))
        1.0

        Partially inconsistent answers.

        >>> alpha_krippendorff(pd.DataFrame.from_records([
        ...     {'task': 'X', 'worker': 'A', 'label': 'Yes'},
        ...     {'task': 'X', 'worker': 'B', 'label': 'Yes'},
        ...     {'task': 'Y', 'worker': 'A', 'label': 'No'},
        ...     {'task': 'Y', 'worker': 'B', 'label': 'No'},
        ...     {'task': 'Z', 'worker': 'A', 'label': 'Yes'},
        ...     {'task': 'Z', 'worker': 'B', 'label': 'No'},
        ... ]))
        0.4444444444444444
    """
    _check_answers(answers)
    # AnnotationTask consumes (coder, item, label) triples.
    triples: List[Tuple[Any, Hashable, Hashable]] = [
        tuple(row)
        for row in answers[["worker", "task", "label"]].itertuples(index=False)
    ]
    return float(AnnotationTask(triples, distance).alpha())

consistency(answers, workers_skills=None, aggregator=MajorityVote(), by_task=False)

Consistency metric: posterior probability of aggregated label given workers skills calculated using the standard Dawid-Skene model.

Parameters:

Name Type Description Default
answers DataFrame

A data frame containing task, worker and label columns.

required
workers_skills Optional[Series]

workers skills, e.g. golden set skills. If not provided, uses the aggregator's workers_skills attribute.

None
aggregator BaseClassificationAggregator

aggregation method, default: MajorityVote

MajorityVote()
by_task bool

if set, returns consistencies for every task in provided data frame.

False

Returns:

Type Description
Union[float, Series[Any]]

Union[float, pd.Series]

Source code in crowdkit/metrics/data/_classification.py
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
def consistency(
    answers: pd.DataFrame,
    workers_skills: Optional["pd.Series[Any]"] = None,
    aggregator: Optional[BaseClassificationAggregator] = None,
    by_task: bool = False,
) -> Union[float, "pd.Series[Any]"]:
    """
    Consistency metric: posterior probability of aggregated label given workers skills
    calculated using the standard Dawid-Skene model.

    Args:
        answers (pandas.DataFrame): A data frame containing `task`, `worker` and `label` columns.
        workers_skills (Optional[pandas.Series]): workers skills e.g. golden set skills. If not provided,
            uses aggregator's `workers_skills` attribute.
        aggregator (Optional[aggregation.base.BaseClassificationAggregator]): aggregation method,
            default: MajorityVote (a fresh instance is created on every call).
        by_task (bool): if set, returns consistencies for every task in provided data frame.

    Returns:
        Union[float, pd.Series]
    """
    _check_answers(answers)
    # Create the default aggregator inside the function rather than as a
    # default argument: a module-level MajorityVote() default would be one
    # shared instance that keeps fitted state (e.g. `skills_`) between
    # unrelated calls (mutable-default pitfall).
    if aggregator is None:
        aggregator = MajorityVote()
    aggregated = aggregator.fit_predict(answers)
    if workers_skills is None:
        if hasattr(aggregator, "skills_"):
            workers_skills = aggregator.skills_
        else:
            raise AssertionError(
                "This aggregator is not supported. Please, provide workers skills."
            )

    # Shallow copy: the columns added below must not leak into the caller's frame.
    answers = answers.copy(deep=False)
    # Index by worker so the skills Series aligns by worker id on assignment
    # (same approach as `uncertainty`; replaces a redundant
    # set_index("task")/reset_index() round-trip).
    answers = answers.set_index("worker")
    answers["skill"] = workers_skills
    answers.reset_index(inplace=True)

    labels = pd.unique(answers.label)
    for label in labels:
        # Per-row probability mass assigned to `label` given the worker's skill.
        answers[label] = answers.apply(
            lambda row: _label_probability(row, label, len(labels)), axis=1
        )

    # Product over workers gives the unnormalized posterior for each label.
    labels_proba = answers.groupby("task").prod(numeric_only=True)
    labels_proba["aggregated_label"] = aggregated
    labels_proba["denominator"] = labels_proba[list(labels)].sum(axis=1)

    consistencies = labels_proba.apply(_task_consistency, axis=1)

    if by_task:
        return consistencies
    else:
        return consistencies.mean()

uncertainty(answers, workers_skills=None, aggregator=None, compute_by='task', aggregate=True)

Label uncertainty metric: entropy of labels probability distribution. Computed as Shannon's Entropy with label probabilities computed either for tasks or workers: $$H(L) = -\sum_{label_i \in L} p(label_i) \cdot \log(p(label_i))$$

Parameters:

Name Type Description Default
answers DataFrame

A data frame containing task, worker and label columns.

required
workers_skills Optional[Series[Any]]

workers skills e.g. golden set skills. If not provided, but aggregator provided, uses aggregator's workers_skills attribute. Otherwise assumes equal skills for workers.

None
aggregator Optional[BaseClassificationAggregator]

aggregation method to obtain worker skills if not provided.

None
compute_by str

what to compute uncertainty for. If 'task', compute uncertainty of answers per task. If 'worker', compute uncertainty for each worker.

'task'
aggregate bool

If true, return the mean uncertainty, otherwise return uncertainties for each task or worker.

True

Returns:

Type Description
Union[float, Series[Any]]

Union[float, pd.Series]

Examples:

Mean task uncertainty minimal, as all answers to task are same.

>>> uncertainty(pd.DataFrame.from_records([
>>>     {'task': 'X', 'worker': 'A', 'label': 'Yes'},
>>>     {'task': 'X', 'worker': 'B', 'label': 'Yes'},
>>> ]))
0.0

Mean task uncertainty maximal, as all answers to task are different.

>>> uncertainty(pd.DataFrame.from_records([
>>>     {'task': 'X', 'worker': 'A', 'label': 'Yes'},
>>>     {'task': 'X', 'worker': 'B', 'label': 'No'},
>>>     {'task': 'X', 'worker': 'C', 'label': 'Maybe'},
>>> ]))
1.0986122886681096

Uncertainty by task without averaging.

>>> uncertainty(pd.DataFrame.from_records([
>>>     {'task': 'X', 'worker': 'A', 'label': 'Yes'},
>>>     {'task': 'X', 'worker': 'B', 'label': 'No'},
>>>     {'task': 'Y', 'worker': 'A', 'label': 'Yes'},
>>>     {'task': 'Y', 'worker': 'B', 'label': 'Yes'},
>>> ]),
>>> workers_skills=pd.Series([1, 1], index=['A', 'B']),
>>> compute_by="task", aggregate=False)
task
X    0.693147
Y    0.000000
dtype: float64

Uncertainty by worker

>>> uncertainty(pd.DataFrame.from_records([
>>>     {'task': 'X', 'worker': 'A', 'label': 'Yes'},
>>>     {'task': 'X', 'worker': 'B', 'label': 'No'},
>>>     {'task': 'Y', 'worker': 'A', 'label': 'Yes'},
>>>     {'task': 'Y', 'worker': 'B', 'label': 'Yes'},
>>> ]),
>>> workers_skills=pd.Series([1, 1], index=['A', 'B']),
>>> compute_by="worker", aggregate=False)
worker
A    0.000000
B    0.693147
dtype: float64

Parameters:

Name Type Description Default
answers DataFrame

Workers' labeling results. A pandas.DataFrame containing task, worker and label columns.

required
workers_skills Optional[Series]

workers' skills. A pandas.Series index by workers and holding corresponding worker's skill

None
Source code in crowdkit/metrics/data/_classification.py
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
def uncertainty(
    answers: pd.DataFrame,
    workers_skills: Optional["pd.Series[Any]"] = None,
    aggregator: Optional[BaseClassificationAggregator] = None,
    compute_by: str = "task",
    aggregate: bool = True,
) -> Union[float, "pd.Series[Any]"]:
    r"""Label uncertainty metric: entropy of labels probability distribution.
    Computed as Shannon's Entropy with label probabilities computed either for tasks or workers:
    $$H(L) = -\sum_{label_i \in L} p(label_i) \cdot \log(p(label_i))$$

    Args:
        answers: Workers' labeling results. A data frame containing `task`,
            `worker` and `label` columns.
        workers_skills: workers' skills, e.g. golden set skills. A pandas.Series
            indexed by workers and holding the corresponding worker's skill.
            If not provided, but aggregator provided, uses aggregator's
            `workers_skills` attribute. Otherwise assumes equal skills for workers.
        aggregator: aggregation method to obtain
            worker skills if not provided.
        compute_by: what to compute uncertainty for. If 'task', compute uncertainty of answers per task.
            If 'worker', compute uncertainty for each worker.
        aggregate: If true, return the mean uncertainty, otherwise return uncertainties for each task or worker.

    Returns:
        Union[float, pd.Series]

    Examples:
        Mean task uncertainty minimal, as all answers to task are same.

        >>> uncertainty(pd.DataFrame.from_records([
        ...     {'task': 'X', 'worker': 'A', 'label': 'Yes'},
        ...     {'task': 'X', 'worker': 'B', 'label': 'Yes'},
        ... ]))
        0.0

        Mean task uncertainty maximal, as all answers to task are different.

        >>> uncertainty(pd.DataFrame.from_records([
        ...     {'task': 'X', 'worker': 'A', 'label': 'Yes'},
        ...     {'task': 'X', 'worker': 'B', 'label': 'No'},
        ...     {'task': 'X', 'worker': 'C', 'label': 'Maybe'},
        ... ]))
        1.0986122886681096

        Uncertainty by task without averaging.

        >>> uncertainty(pd.DataFrame.from_records([
        ...     {'task': 'X', 'worker': 'A', 'label': 'Yes'},
        ...     {'task': 'X', 'worker': 'B', 'label': 'No'},
        ...     {'task': 'Y', 'worker': 'A', 'label': 'Yes'},
        ...     {'task': 'Y', 'worker': 'B', 'label': 'Yes'},
        ... ]),
        ... workers_skills=pd.Series([1, 1], index=['A', 'B']),
        ... compute_by="task", aggregate=False)
        task
        X    0.693147
        Y    0.000000
        dtype: float64

        Uncertainty by worker

        >>> uncertainty(pd.DataFrame.from_records([
        ...     {'task': 'X', 'worker': 'A', 'label': 'Yes'},
        ...     {'task': 'X', 'worker': 'B', 'label': 'No'},
        ...     {'task': 'Y', 'worker': 'A', 'label': 'Yes'},
        ...     {'task': 'Y', 'worker': 'B', 'label': 'Yes'},
        ... ]),
        ... workers_skills=pd.Series([1, 1], index=['A', 'B']),
        ... compute_by="worker", aggregate=False)
        worker
        A    0.000000
        B    0.693147
        dtype: float64
    """
    _check_answers(answers)

    # Derive skills from the aggregator only when none were given explicitly.
    if workers_skills is None and aggregator is not None:
        aggregator.fit(answers)
        if hasattr(aggregator, "skills_"):
            workers_skills = aggregator.skills_
        else:
            raise AssertionError(
                "This aggregator is not supported. Please, provide workers skills."
            )

    # Shallow copy: the columns added below must not leak into the caller's frame.
    answers = answers.copy(deep=False)
    # Index by worker so the skills Series aligns by worker id on assignment;
    # with no skills at all, every worker gets equal weight 1.
    answers = answers.set_index("worker")
    answers["skill"] = workers_skills if workers_skills is not None else 1
    if answers["skill"].isnull().any():
        # Skills were provided but are missing for some workers present in `answers`.
        missing_workers = set(answers[answers.skill.isnull()].index.tolist())
        raise AssertionError(
            f"Did not provide skills for workers: {missing_workers}."
            f"Please provide workers skills."
        )
    answers.reset_index(inplace=True)
    labels = pd.unique(answers.label)
    for label in labels:
        # Per-row probability mass assigned to `label` given the worker's skill.
        # The lambda is applied immediately within this iteration, so binding
        # the loop variable `label` here is safe.
        answers[label] = answers.apply(
            lambda row: _label_probability(row, label, len(labels)), axis=1
        )

    # Sum probability mass per task (or per worker), then take the entropy of
    # the normalized distribution; the 1e-6 term guards against division by zero.
    labels_proba = answers.groupby(compute_by).sum(numeric_only=True)
    uncertainties = labels_proba.apply(
        lambda row: entropy(row[labels] / (sum(row[labels]) + 1e-6)), axis=1
    )

    if aggregate:
        return cast(float, uncertainties.mean())

    return cast("pd.Series[Any]", uncertainties)

Metrics for workers.