@inproceedings{ashby-weir-2020-leveraging,
title = "Leveraging {HTML} in Free Text Web Named Entity Recognition",
author = "Ashby, Colin and
Weir, David",
editor = "Scott, Donia and
Bel, Nuria and
Zong, Chengqing",
booktitle = "Proceedings of the 28th International Conference on Computational Linguistics",
month = dec,
year = "2020",
address = "Barcelona, Spain (Online)",
publisher = "International Committee on Computational Linguistics",
url = "https://aclanthology.org/2020.coling-main.36",
doi = "10.18653/v1/2020.coling-main.36",
pages = "407--413",
abstract = "HTML tags are typically discarded in free text Named Entity Recognition from Web pages. We investigate whether these discarded tags might be used to improve NER performance. We compare Text+Tags sentences with their Text-Only equivalents, over five datasets, two free text segmentation granularities and two NER models. We find an increased F1 performance for Text+Tags of between 0.9{\%} and 13.2{\%} over all datasets, variants and models. This performance increase, over datasets of varying entity types, HTML density and construction quality, indicates our method is flexible and adaptable. These findings imply that a similar technique might be of use in other Web-aware NLP tasks, including the enrichment of deep language models.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ashby-weir-2020-leveraging">
<titleInfo>
<title>Leveraging HTML in Free Text Web Named Entity Recognition</title>
</titleInfo>
<name type="personal">
<namePart type="given">Colin</namePart>
<namePart type="family">Ashby</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Weir</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 28th International Conference on Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Donia</namePart>
<namePart type="family">Scott</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nuria</namePart>
<namePart type="family">Bel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chengqing</namePart>
<namePart type="family">Zong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>International Committee on Computational Linguistics</publisher>
<place>
<placeTerm type="text">Barcelona, Spain (Online)</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>HTML tags are typically discarded in free text Named Entity Recognition from Web pages. We investigate whether these discarded tags might be used to improve NER performance. We compare Text+Tags sentences with their Text-Only equivalents, over five datasets, two free text segmentation granularities and two NER models. We find an increased F1 performance for Text+Tags of between 0.9% and 13.2% over all datasets, variants and models. This performance increase, over datasets of varying entity types, HTML density and construction quality, indicates our method is flexible and adaptable. These findings imply that a similar technique might be of use in other Web-aware NLP tasks, including the enrichment of deep language models.</abstract>
<identifier type="citekey">ashby-weir-2020-leveraging</identifier>
<identifier type="doi">10.18653/v1/2020.coling-main.36</identifier>
<location>
<url>https://aclanthology.org/2020.coling-main.36</url>
</location>
<part>
<date>2020-12</date>
<extent unit="page">
<start>407</start>
<end>413</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Leveraging HTML in Free Text Web Named Entity Recognition
%A Ashby, Colin
%A Weir, David
%Y Scott, Donia
%Y Bel, Nuria
%Y Zong, Chengqing
%S Proceedings of the 28th International Conference on Computational Linguistics
%D 2020
%8 December
%I International Committee on Computational Linguistics
%C Barcelona, Spain (Online)
%F ashby-weir-2020-leveraging
%X HTML tags are typically discarded in free text Named Entity Recognition from Web pages. We investigate whether these discarded tags might be used to improve NER performance. We compare Text+Tags sentences with their Text-Only equivalents, over five datasets, two free text segmentation granularities and two NER models. We find an increased F1 performance for Text+Tags of between 0.9% and 13.2% over all datasets, variants and models. This performance increase, over datasets of varying entity types, HTML density and construction quality, indicates our method is flexible and adaptable. These findings imply that a similar technique might be of use in other Web-aware NLP tasks, including the enrichment of deep language models.
%R 10.18653/v1/2020.coling-main.36
%U https://aclanthology.org/2020.coling-main.36
%U https://doi.org/10.18653/v1/2020.coling-main.36
%P 407-413
Markdown (Informal)
[Leveraging HTML in Free Text Web Named Entity Recognition](https://aclanthology.org/2020.coling-main.36) (Ashby & Weir, COLING 2020)
ACL