@inproceedings{ceolin-zhang-2020-discriminating,
title = "Discriminating between standard {R}omanian and {M}oldavian tweets using filtered character ngrams",
author = "Ceolin, Andrea and
Zhang, Hong",
editor = {Zampieri, Marcos and
Nakov, Preslav and
Ljube{\v{s}}i{\'c}, Nikola and
Tiedemann, J{\"o}rg and
Scherrer, Yves},
booktitle = "Proceedings of the 7th Workshop on NLP for Similar Languages, Varieties and Dialects",
month = dec,
year = "2020",
address = "Barcelona, Spain (Online)",
publisher = "International Committee on Computational Linguistics (ICCL)",
url = "https://aclanthology.org/2020.vardial-1.25",
pages = "265--272",
abstract = "We applied word unigram models, character ngram models, and CNNs to the task of distinguishing tweets of two related dialects of Romanian (standard Romanian and Moldavian) for the VarDial 2020 RDI shared task (Gaman et al. 2020). The main challenge of the task was to perform cross-genre text classification: specifically, the models must be trained using text from news articles, and be used to predict tweets. Our best model was a Naive Bayes model trained on character ngrams, with the most common ngrams filtered out. We also applied SVMs and CNNs, but while they yielded the best performance on an evaluation dataset of news article, their accuracy significantly dropped when they were used to predict tweets. Our best model reached an F1 score of 0.715 on the evaluation dataset of tweets, and 0.667 on the held-out test dataset. The model ended up in the third place in the shared task.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ceolin-zhang-2020-discriminating">
<titleInfo>
<title>Discriminating between standard Romanian and Moldavian tweets using filtered character ngrams</title>
</titleInfo>
<name type="personal">
<namePart type="given">Andrea</namePart>
<namePart type="family">Ceolin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hong</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 7th Workshop on NLP for Similar Languages, Varieties and Dialects</title>
</titleInfo>
<name type="personal">
<namePart type="given">Marcos</namePart>
<namePart type="family">Zampieri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Preslav</namePart>
<namePart type="family">Nakov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nikola</namePart>
<namePart type="family">Ljubešić</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jörg</namePart>
<namePart type="family">Tiedemann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yves</namePart>
<namePart type="family">Scherrer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>International Committee on Computational Linguistics (ICCL)</publisher>
<place>
<placeTerm type="text">Barcelona, Spain (Online)</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We applied word unigram models, character ngram models, and CNNs to the task of distinguishing tweets of two related dialects of Romanian (standard Romanian and Moldavian) for the VarDial 2020 RDI shared task (Gaman et al. 2020). The main challenge of the task was to perform cross-genre text classification: specifically, the models must be trained using text from news articles, and be used to predict tweets. Our best model was a Naive Bayes model trained on character ngrams, with the most common ngrams filtered out. We also applied SVMs and CNNs, but while they yielded the best performance on an evaluation dataset of news article, their accuracy significantly dropped when they were used to predict tweets. Our best model reached an F1 score of 0.715 on the evaluation dataset of tweets, and 0.667 on the held-out test dataset. The model ended up in the third place in the shared task.</abstract>
<identifier type="citekey">ceolin-zhang-2020-discriminating</identifier>
<location>
<url>https://aclanthology.org/2020.vardial-1.25</url>
</location>
<part>
<date>2020-12</date>
<extent unit="page">
<start>265</start>
<end>272</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Discriminating between standard Romanian and Moldavian tweets using filtered character ngrams
%A Ceolin, Andrea
%A Zhang, Hong
%Y Zampieri, Marcos
%Y Nakov, Preslav
%Y Ljubešić, Nikola
%Y Tiedemann, Jörg
%Y Scherrer, Yves
%S Proceedings of the 7th Workshop on NLP for Similar Languages, Varieties and Dialects
%D 2020
%8 December
%I International Committee on Computational Linguistics (ICCL)
%C Barcelona, Spain (Online)
%F ceolin-zhang-2020-discriminating
%X We applied word unigram models, character ngram models, and CNNs to the task of distinguishing tweets of two related dialects of Romanian (standard Romanian and Moldavian) for the VarDial 2020 RDI shared task (Gaman et al. 2020). The main challenge of the task was to perform cross-genre text classification: specifically, the models must be trained using text from news articles, and be used to predict tweets. Our best model was a Naive Bayes model trained on character ngrams, with the most common ngrams filtered out. We also applied SVMs and CNNs, but while they yielded the best performance on an evaluation dataset of news article, their accuracy significantly dropped when they were used to predict tweets. Our best model reached an F1 score of 0.715 on the evaluation dataset of tweets, and 0.667 on the held-out test dataset. The model ended up in the third place in the shared task.
%U https://aclanthology.org/2020.vardial-1.25
%P 265-272
Markdown (Informal)
[Discriminating between standard Romanian and Moldavian tweets using filtered character ngrams](https://aclanthology.org/2020.vardial-1.25) (Ceolin & Zhang, VarDial 2020)
ACL