@inproceedings{jonker-etal-2020-bag,
title = "Bag {\&} Tag{'}em - A New {D}utch Stemmer",
author = "Jonker, Anne and
de Ruijt, Corn{\'e} and
de Gruijl, Jornt",
editor = "Calzolari, Nicoletta and
B{\'e}chet, Fr{\'e}d{\'e}ric and
Blache, Philippe and
Choukri, Khalid and
Cieri, Christopher and
Declerck, Thierry and
Goggi, Sara and
Isahara, Hitoshi and
Maegaard, Bente and
Mariani, Joseph and
Mazo, H{\'e}l{\`e}ne and
Moreno, Asuncion and
Odijk, Jan and
Piperidis, Stelios",
booktitle = "Proceedings of the Twelfth Language Resources and Evaluation Conference",
month = may,
year = "2020",
address = "Marseille, France",
publisher = "European Language Resources Association",
url = "https://aclanthology.org/2020.lrec-1.477",
pages = "3868--3876",
abstract = "We propose a novel stemming algorithm that is both robust and accurate compared to state-of-the-art solutions, yet addresses several of the problems that current stemmers face in the Dutch language. The main issue is that most current stemmers cannot handle 3rd person singular forms of verbs and many irregular words and conjugations, unless a (nearly) brute-force approach is used. Our algorithm combines a new tagging module with a stemmer that uses tag-specific sets of rigid rules: the Bag {\&} Tag{'}em (BT) algorithm. The tagging module is developed and evaluated using three algorithms: Multinomial Logistic Regression (MLR), Neural Network (NN) and Extreme Gradient Boosting (XGB). The stemming module{'}s performance is compared with that of current state-of-the-art stemming algorithms for the Dutch Language. Even though there is still room for improvement, the new BT algorithm performs well in the sense that it is more accurate than the current stemmers and faster than brute-force-like algorithms. The code and data used for this paper can be found at: \url{https://github.com/Anne-Jonker/Bag-Tag-em}.",
language = "English",
ISBN = "979-10-95546-34-4",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="jonker-etal-2020-bag">
<titleInfo>
<title>Bag & Tag’em - A New Dutch Stemmer</title>
</titleInfo>
<name type="personal">
<namePart type="given">Anne</namePart>
<namePart type="family">Jonker</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Corné</namePart>
<namePart type="family">de Ruijt</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jornt</namePart>
<namePart type="family">de Gruijl</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<language>
<languageTerm type="text">English</languageTerm>
<languageTerm type="code" authority="iso639-2b">eng</languageTerm>
</language>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Twelfth Language Resources and Evaluation Conference</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Frédéric</namePart>
<namePart type="family">Béchet</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Philippe</namePart>
<namePart type="family">Blache</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalid</namePart>
<namePart type="family">Choukri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christopher</namePart>
<namePart type="family">Cieri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thierry</namePart>
<namePart type="family">Declerck</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sara</namePart>
<namePart type="family">Goggi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hitoshi</namePart>
<namePart type="family">Isahara</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bente</namePart>
<namePart type="family">Maegaard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joseph</namePart>
<namePart type="family">Mariani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hélène</namePart>
<namePart type="family">Mazo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Asuncion</namePart>
<namePart type="family">Moreno</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jan</namePart>
<namePart type="family">Odijk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stelios</namePart>
<namePart type="family">Piperidis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association</publisher>
<place>
<placeTerm type="text">Marseille, France</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-10-95546-34-4</identifier>
</relatedItem>
<abstract>We propose a novel stemming algorithm that is both robust and accurate compared to state-of-the-art solutions, yet addresses several of the problems that current stemmers face in the Dutch language. The main issue is that most current stemmers cannot handle 3rd person singular forms of verbs and many irregular words and conjugations, unless a (nearly) brute-force approach is used. Our algorithm combines a new tagging module with a stemmer that uses tag-specific sets of rigid rules: the Bag & Tag’em (BT) algorithm. The tagging module is developed and evaluated using three algorithms: Multinomial Logistic Regression (MLR), Neural Network (NN) and Extreme Gradient Boosting (XGB). The stemming module’s performance is compared with that of current state-of-the-art stemming algorithms for the Dutch Language. Even though there is still room for improvement, the new BT algorithm performs well in the sense that it is more accurate than the current stemmers and faster than brute-force-like algorithms. The code and data used for this paper can be found at: https://github.com/Anne-Jonker/Bag-Tag-em.</abstract>
<identifier type="citekey">jonker-etal-2020-bag</identifier>
<location>
<url>https://aclanthology.org/2020.lrec-1.477</url>
</location>
<part>
<date>2020-05</date>
<extent unit="page">
<start>3868</start>
<end>3876</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Bag & Tag’em - A New Dutch Stemmer
%A Jonker, Anne
%A de Ruijt, Corné
%A de Gruijl, Jornt
%Y Calzolari, Nicoletta
%Y Béchet, Frédéric
%Y Blache, Philippe
%Y Choukri, Khalid
%Y Cieri, Christopher
%Y Declerck, Thierry
%Y Goggi, Sara
%Y Isahara, Hitoshi
%Y Maegaard, Bente
%Y Mariani, Joseph
%Y Mazo, Hélène
%Y Moreno, Asuncion
%Y Odijk, Jan
%Y Piperidis, Stelios
%S Proceedings of the Twelfth Language Resources and Evaluation Conference
%D 2020
%8 May
%I European Language Resources Association
%C Marseille, France
%@ 979-10-95546-34-4
%G English
%F jonker-etal-2020-bag
%X We propose a novel stemming algorithm that is both robust and accurate compared to state-of-the-art solutions, yet addresses several of the problems that current stemmers face in the Dutch language. The main issue is that most current stemmers cannot handle 3rd person singular forms of verbs and many irregular words and conjugations, unless a (nearly) brute-force approach is used. Our algorithm combines a new tagging module with a stemmer that uses tag-specific sets of rigid rules: the Bag & Tag’em (BT) algorithm. The tagging module is developed and evaluated using three algorithms: Multinomial Logistic Regression (MLR), Neural Network (NN) and Extreme Gradient Boosting (XGB). The stemming module’s performance is compared with that of current state-of-the-art stemming algorithms for the Dutch Language. Even though there is still room for improvement, the new BT algorithm performs well in the sense that it is more accurate than the current stemmers and faster than brute-force-like algorithms. The code and data used for this paper can be found at: https://github.com/Anne-Jonker/Bag-Tag-em.
%U https://aclanthology.org/2020.lrec-1.477
%P 3868-3876
Markdown (Informal)
[Bag & Tag’em - A New Dutch Stemmer](https://aclanthology.org/2020.lrec-1.477) (Jonker et al., LREC 2020)
ACL
- Anne Jonker, Corné de Ruijt, and Jornt de Gruijl. 2020. Bag & Tag’em - A New Dutch Stemmer. In Proceedings of the Twelfth Language Resources and Evaluation Conference, pages 3868–3876, Marseille, France. European Language Resources Association.