@inproceedings{oshin-etal-2023-bateclacor,
title = "{B}a{TEC}la{C}or: A Novel Dataset for {B}angla Text Error Classification and Correction",
author = "Oshin, Nabilah and
Hoque, Syed and
Fahim, Md and
Ali, Amin Ahsan and
Amin, M Ashraful and
Rahman, Akmmahbubur",
editor = "Alam, Firoj and
Kar, Sudipta and
Chowdhury, Shammur Absar and
Sadeque, Farig and
Amin, Ruhul",
booktitle = "Proceedings of the First Workshop on Bangla Language Processing (BLP-2023)",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.banglalp-1.14",
doi = "10.18653/v1/2023.banglalp-1.14",
pages = "124--135",
abstract = "In the context of the dynamic realm of Bangla communication, online users are often prone to bending the language or making errors due to various factors. We attempt to detect, categorize, and correct those errors by employing several machine learning and deep learning models. To contribute to the preservation and authenticity of the Bangla language, we introduce a meticulously categorized organic dataset encompassing 10,000 authentic Bangla comments from a commonly used social media platform. Through rigorous comparative analysis of distinct models, our study highlights BanglaBERT{'}s superiority in error-category classification and underscores the effectiveness of BanglaT5 for text correction. BanglaBERT achieves accuracy of 79.1{\%} and 74.1{\%} for binary and multiclass error-category classification while the BanglaBERT is fine-tuned and tested with our proposed dataset. Moreover, BanglaT5 achieves the best Rouge-L score (0.8459) when BanglaT5 is fine-tuned and tested with our corrected ground truths. Beyond algorithmic exploration, this endeavor represents a significant stride in enhancing the quality of digital discourse in the Bangla-speaking community, fostering linguistic precision and coherence in online interactions. The dataset and code is available at https://github.com/SyedT1/BaTEClaCor.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="oshin-etal-2023-bateclacor">
<titleInfo>
<title>BaTEClaCor: A Novel Dataset for Bangla Text Error Classification and Correction</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nabilah</namePart>
<namePart type="family">Oshin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Syed</namePart>
<namePart type="family">Hoque</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Md</namePart>
<namePart type="family">Fahim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Amin</namePart>
<namePart type="given">Ahsan</namePart>
<namePart type="family">Ali</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">M</namePart>
<namePart type="given">Ashraful</namePart>
<namePart type="family">Amin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Akmmahbubur</namePart>
<namePart type="family">Rahman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the First Workshop on Bangla Language Processing (BLP-2023)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Firoj</namePart>
<namePart type="family">Alam</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sudipta</namePart>
<namePart type="family">Kar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shammur</namePart>
<namePart type="given">Absar</namePart>
<namePart type="family">Chowdhury</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Farig</namePart>
<namePart type="family">Sadeque</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ruhul</namePart>
<namePart type="family">Amin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Singapore</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In the context of the dynamic realm of Bangla communication, online users are often prone to bending the language or making errors due to various factors. We attempt to detect, categorize, and correct those errors by employing several machine learning and deep learning models. To contribute to the preservation and authenticity of the Bangla language, we introduce a meticulously categorized organic dataset encompassing 10,000 authentic Bangla comments from a commonly used social media platform. Through rigorous comparative analysis of distinct models, our study highlights BanglaBERT’s superiority in error-category classification and underscores the effectiveness of BanglaT5 for text correction. BanglaBERT achieves accuracy of 79.1% and 74.1% for binary and multiclass error-category classification while the BanglaBERT is fine-tuned and tested with our proposed dataset. Moreover, BanglaT5 achieves the best Rouge-L score (0.8459) when BanglaT5 is fine-tuned and tested with our corrected ground truths. Beyond algorithmic exploration, this endeavor represents a significant stride in enhancing the quality of digital discourse in the Bangla-speaking community, fostering linguistic precision and coherence in online interactions. The dataset and code is available at https://github.com/SyedT1/BaTEClaCor.</abstract>
<identifier type="citekey">oshin-etal-2023-bateclacor</identifier>
<identifier type="doi">10.18653/v1/2023.banglalp-1.14</identifier>
<location>
<url>https://aclanthology.org/2023.banglalp-1.14</url>
</location>
<part>
<date>2023-12</date>
<extent unit="page">
<start>124</start>
<end>135</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T BaTEClaCor: A Novel Dataset for Bangla Text Error Classification and Correction
%A Oshin, Nabilah
%A Hoque, Syed
%A Fahim, Md
%A Ali, Amin Ahsan
%A Amin, M. Ashraful
%A Rahman, Akmmahbubur
%Y Alam, Firoj
%Y Kar, Sudipta
%Y Chowdhury, Shammur Absar
%Y Sadeque, Farig
%Y Amin, Ruhul
%S Proceedings of the First Workshop on Bangla Language Processing (BLP-2023)
%D 2023
%8 December
%I Association for Computational Linguistics
%C Singapore
%F oshin-etal-2023-bateclacor
%X In the context of the dynamic realm of Bangla communication, online users are often prone to bending the language or making errors due to various factors. We attempt to detect, categorize, and correct those errors by employing several machine learning and deep learning models. To contribute to the preservation and authenticity of the Bangla language, we introduce a meticulously categorized organic dataset encompassing 10,000 authentic Bangla comments from a commonly used social media platform. Through rigorous comparative analysis of distinct models, our study highlights BanglaBERT’s superiority in error-category classification and underscores the effectiveness of BanglaT5 for text correction. BanglaBERT achieves accuracy of 79.1% and 74.1% for binary and multiclass error-category classification while the BanglaBERT is fine-tuned and tested with our proposed dataset. Moreover, BanglaT5 achieves the best Rouge-L score (0.8459) when BanglaT5 is fine-tuned and tested with our corrected ground truths. Beyond algorithmic exploration, this endeavor represents a significant stride in enhancing the quality of digital discourse in the Bangla-speaking community, fostering linguistic precision and coherence in online interactions. The dataset and code is available at https://github.com/SyedT1/BaTEClaCor.
%R 10.18653/v1/2023.banglalp-1.14
%U https://aclanthology.org/2023.banglalp-1.14
%U https://doi.org/10.18653/v1/2023.banglalp-1.14
%P 124-135
Markdown (Informal)
[BaTEClaCor: A Novel Dataset for Bangla Text Error Classification and Correction](https://aclanthology.org/2023.banglalp-1.14) (Oshin et al., BanglaLP 2023)
ACL