@inproceedings{kambhatla-etal-2023-quantifying,
title = "Quantifying Train-Evaluation Overlap with Nearest Neighbors",
author = "Kambhatla, Gauri and
Nguyen, Thuy and
Choi, Eunsol",
editor = "Rogers, Anna and
Boyd-Graber, Jordan and
Okazaki, Naoaki",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2023",
month = jul,
year = "2023",
address = "Toronto, Canada",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.findings-acl.183",
doi = "10.18653/v1/2023.findings-acl.183",
pages = "2905--2920",
abstract = "Characterizing benchmark datasets is crucial to interpreting model performance. In this work, we study train-evaluation overlap as a measure of an individual dataset{'}s adequacy to evaluate model generalization over a wide range of datasets. We quantify the overlap with a simple novel metric based on a nearest neighbors approach between the training and evaluation sets. We identify nearest training examples for each evaluation example by mapping instances with generic and task-specific embedding methods. Our study on eleven classification and extractive QA tasks reveals a wide range of train-evaluation overlap, and we show that the data collection method of the dataset and the difficulty of the task may play a role in the amount of overlap. Lastly, we use our nearest neighbor analysis to identify challenging or potentially mislabeled examples. Our analysis quantifies train-evaluation overlap, providing insights for constructing datasets to study generalization.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kambhatla-etal-2023-quantifying">
<titleInfo>
<title>Quantifying Train-Evaluation Overlap with Nearest Neighbors</title>
</titleInfo>
<name type="personal">
<namePart type="given">Gauri</namePart>
<namePart type="family">Kambhatla</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thuy</namePart>
<namePart type="family">Nguyen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Eunsol</namePart>
<namePart type="family">Choi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2023</title>
</titleInfo>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="family">Rogers</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jordan</namePart>
<namePart type="family">Boyd-Graber</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Naoaki</namePart>
<namePart type="family">Okazaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Toronto, Canada</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Characterizing benchmark datasets is crucial to interpreting model performance. In this work, we study train-evaluation overlap as a measure of an individual dataset’s adequacy to evaluate model generalization over a wide range of datasets. We quantify the overlap with a simple novel metric based on a nearest neighbors approach between the training and evaluation sets. We identify nearest training examples for each evaluation example by mapping instances with generic and task-specific embedding methods. Our study on eleven classification and extractive QA tasks reveals a wide range of train-evaluation overlap, and we show that the data collection method of the dataset and the difficulty of the task may play a role in the amount of overlap. Lastly, we use our nearest neighbor analysis to identify challenging or potentially mislabeled examples. Our analysis quantifies train-evaluation overlap, providing insights for constructing datasets to study generalization.</abstract>
<identifier type="citekey">kambhatla-etal-2023-quantifying</identifier>
<identifier type="doi">10.18653/v1/2023.findings-acl.183</identifier>
<location>
<url>https://aclanthology.org/2023.findings-acl.183</url>
</location>
<part>
<date>2023-07</date>
<extent unit="page">
<start>2905</start>
<end>2920</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Quantifying Train-Evaluation Overlap with Nearest Neighbors
%A Kambhatla, Gauri
%A Nguyen, Thuy
%A Choi, Eunsol
%Y Rogers, Anna
%Y Boyd-Graber, Jordan
%Y Okazaki, Naoaki
%S Findings of the Association for Computational Linguistics: ACL 2023
%D 2023
%8 July
%I Association for Computational Linguistics
%C Toronto, Canada
%F kambhatla-etal-2023-quantifying
%X Characterizing benchmark datasets is crucial to interpreting model performance. In this work, we study train-evaluation overlap as a measure of an individual dataset’s adequacy to evaluate model generalization over a wide range of datasets. We quantify the overlap with a simple novel metric based on a nearest neighbors approach between the training and evaluation sets. We identify nearest training examples for each evaluation example by mapping instances with generic and task-specific embedding methods. Our study on eleven classification and extractive QA tasks reveals a wide range of train-evaluation overlap, and we show that the data collection method of the dataset and the difficulty of the task may play a role in the amount of overlap. Lastly, we use our nearest neighbor analysis to identify challenging or potentially mislabeled examples. Our analysis quantifies train-evaluation overlap, providing insights for constructing datasets to study generalization.
%R 10.18653/v1/2023.findings-acl.183
%U https://aclanthology.org/2023.findings-acl.183
%U https://doi.org/10.18653/v1/2023.findings-acl.183
%P 2905-2920
Markdown (Informal)
[Quantifying Train-Evaluation Overlap with Nearest Neighbors](https://aclanthology.org/2023.findings-acl.183) (Kambhatla et al., Findings 2023)
ACL