@inproceedings{tran-etal-2020-introducing,
title = "Introducing a Large-Scale Dataset for {V}ietnamese {POS} Tagging on Conversational Texts",
author = "Tran, Oanh and
Pham, Tu and
Dang, Vu and
Nguyen, Bang",
editor = "Calzolari, Nicoletta and
B{\'e}chet, Fr{\'e}d{\'e}ric and
Blache, Philippe and
Choukri, Khalid and
Cieri, Christopher and
Declerck, Thierry and
Goggi, Sara and
Isahara, Hitoshi and
Maegaard, Bente and
Mariani, Joseph and
Mazo, H{\'e}l{\`e}ne and
Moreno, Asuncion and
Odijk, Jan and
Piperidis, Stelios",
booktitle = "Proceedings of the Twelfth Language Resources and Evaluation Conference",
month = may,
year = "2020",
address = "Marseille, France",
publisher = "European Language Resources Association",
url = "https://aclanthology.org/2020.lrec-1.482",
pages = "3913--3921",
abstract = "This paper introduces a large-scale human-labeled dataset for the Vietnamese POS tagging task on conversational texts. To this end, we propose a new tagging scheme (with 36 POS tags) consisting of exclusive tags for special phenomena of conversational words, develop the annotation guideline and manually annotate 16.310K sentences using this guideline. Based on this corpus, a series of state-of-the-art tagging methods has been conducted to estimate their performances. Experimental results showed that the Conditional Random Fields model using both automatically learnt features from deep neural networks and handcrafted features yielded the best performance. This model achieved 93.36{\%} in the accuracy score which is 1.6{\%} and 2.7{\%} higher than the model using either handcrafted features or automatically-learnt features, respectively. This result is also a little bit higher than the model of fine-tuning BERT by 0.94{\%} in the accuracy score. The performance measured on each POS tag is also very high with {\textgreater}90{\%} in the F1 score for 20 POS tags and {\textgreater}80{\%} in the F1 score for 11 POS tags. This work provides the public dataset and preliminary results for follow-up research on this interesting direction.",
language = "English",
ISBN = "979-10-95546-34-4",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="tran-etal-2020-introducing">
<titleInfo>
<title>Introducing a Large-Scale Dataset for Vietnamese POS Tagging on Conversational Texts</title>
</titleInfo>
<name type="personal">
<namePart type="given">Oanh</namePart>
<namePart type="family">Tran</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tu</namePart>
<namePart type="family">Pham</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vu</namePart>
<namePart type="family">Dang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bang</namePart>
<namePart type="family">Nguyen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<language>
<languageTerm type="text">English</languageTerm>
<languageTerm type="code" authority="iso639-2b">eng</languageTerm>
</language>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Twelfth Language Resources and Evaluation Conference</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Frédéric</namePart>
<namePart type="family">Béchet</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Philippe</namePart>
<namePart type="family">Blache</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalid</namePart>
<namePart type="family">Choukri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christopher</namePart>
<namePart type="family">Cieri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thierry</namePart>
<namePart type="family">Declerck</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sara</namePart>
<namePart type="family">Goggi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hitoshi</namePart>
<namePart type="family">Isahara</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bente</namePart>
<namePart type="family">Maegaard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joseph</namePart>
<namePart type="family">Mariani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hélène</namePart>
<namePart type="family">Mazo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Asuncion</namePart>
<namePart type="family">Moreno</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jan</namePart>
<namePart type="family">Odijk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stelios</namePart>
<namePart type="family">Piperidis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association</publisher>
<place>
<placeTerm type="text">Marseille, France</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-10-95546-34-4</identifier>
</relatedItem>
<abstract>This paper introduces a large-scale human-labeled dataset for the Vietnamese POS tagging task on conversational texts. To this end, we propose a new tagging scheme (with 36 POS tags) consisting of exclusive tags for special phenomena of conversational words, develop the annotation guideline and manually annotate 16.310K sentences using this guideline. Based on this corpus, a series of state-of-the-art tagging methods has been conducted to estimate their performances. Experimental results showed that the Conditional Random Fields model using both automatically learnt features from deep neural networks and handcrafted features yielded the best performance. This model achieved 93.36% in the accuracy score which is 1.6% and 2.7% higher than the model using either handcrafted features or automatically-learnt features, respectively. This result is also a little bit higher than the model of fine-tuning BERT by 0.94% in the accuracy score. The performance measured on each POS tag is also very high with &gt;90% in the F1 score for 20 POS tags and &gt;80% in the F1 score for 11 POS tags. This work provides the public dataset and preliminary results for follow-up research on this interesting direction.</abstract>
<identifier type="citekey">tran-etal-2020-introducing</identifier>
<location>
<url>https://aclanthology.org/2020.lrec-1.482</url>
</location>
<part>
<date>2020-05</date>
<extent unit="page">
<start>3913</start>
<end>3921</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Introducing a Large-Scale Dataset for Vietnamese POS Tagging on Conversational Texts
%A Tran, Oanh
%A Pham, Tu
%A Dang, Vu
%A Nguyen, Bang
%Y Calzolari, Nicoletta
%Y Béchet, Frédéric
%Y Blache, Philippe
%Y Choukri, Khalid
%Y Cieri, Christopher
%Y Declerck, Thierry
%Y Goggi, Sara
%Y Isahara, Hitoshi
%Y Maegaard, Bente
%Y Mariani, Joseph
%Y Mazo, Hélène
%Y Moreno, Asuncion
%Y Odijk, Jan
%Y Piperidis, Stelios
%S Proceedings of the Twelfth Language Resources and Evaluation Conference
%D 2020
%8 May
%I European Language Resources Association
%C Marseille, France
%@ 979-10-95546-34-4
%G English
%F tran-etal-2020-introducing
%X This paper introduces a large-scale human-labeled dataset for the Vietnamese POS tagging task on conversational texts. To this end, we propose a new tagging scheme (with 36 POS tags) consisting of exclusive tags for special phenomena of conversational words, develop the annotation guideline and manually annotate 16.310K sentences using this guideline. Based on this corpus, a series of state-of-the-art tagging methods has been conducted to estimate their performances. Experimental results showed that the Conditional Random Fields model using both automatically learnt features from deep neural networks and handcrafted features yielded the best performance. This model achieved 93.36% in the accuracy score which is 1.6% and 2.7% higher than the model using either handcrafted features or automatically-learnt features, respectively. This result is also a little bit higher than the model of fine-tuning BERT by 0.94% in the accuracy score. The performance measured on each POS tag is also very high with >90% in the F1 score for 20 POS tags and >80% in the F1 score for 11 POS tags. This work provides the public dataset and preliminary results for follow-up research on this interesting direction.
%U https://aclanthology.org/2020.lrec-1.482
%P 3913-3921
Markdown (Informal)
[Introducing a Large-Scale Dataset for Vietnamese POS Tagging on Conversational Texts](https://aclanthology.org/2020.lrec-1.482) (Tran et al., LREC 2020)
ACL