@inproceedings{khademi-2020-multimodal,
title = "Multimodal Neural Graph Memory Networks for Visual Question Answering",
author = "Khademi, Mahmoud",
editor = "Jurafsky, Dan and
Chai, Joyce and
Schluter, Natalie and
Tetreault, Joel",
booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics",
month = jul,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.acl-main.643",
doi = "10.18653/v1/2020.acl-main.643",
pages = "7177--7188",
abstract = "We introduce a new neural network architecture, Multimodal Neural Graph Memory Networks (MN-GMN), for visual question answering. The MN-GMN uses graph structure with different region features as node attributes and applies a recently proposed powerful graph neural network model, Graph Network (GN), to reason about objects and their interactions in an image. The input module of the MN-GMN generates a set of visual features plus a set of encoded region-grounded captions (RGCs) for the image. The RGCs capture object attributes and their relationships. Two GNs are constructed from the input module using the visual features and encoded RGCs. Each node of the GNs iteratively computes a question-guided contextualized representation of the visual/textual information assigned to it. Then, to combine the information from both GNs, the nodes write the updated representations to an external spatial memory. The final states of the memory cells are fed into an answer module to predict an answer. Experiments show MN-GMN rivals the state-of-the-art models on Visual7W, VQA-v2.0, and CLEVR datasets.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="khademi-2020-multimodal">
<titleInfo>
<title>Multimodal Neural Graph Memory Networks for Visual Question Answering</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mahmoud</namePart>
<namePart type="family">Khademi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Dan</namePart>
<namePart type="family">Jurafsky</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joyce</namePart>
<namePart type="family">Chai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Natalie</namePart>
<namePart type="family">Schluter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joel</namePart>
<namePart type="family">Tetreault</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We introduce a new neural network architecture, Multimodal Neural Graph Memory Networks (MN-GMN), for visual question answering. The MN-GMN uses graph structure with different region features as node attributes and applies a recently proposed powerful graph neural network model, Graph Network (GN), to reason about objects and their interactions in an image. The input module of the MN-GMN generates a set of visual features plus a set of encoded region-grounded captions (RGCs) for the image. The RGCs capture object attributes and their relationships. Two GNs are constructed from the input module using the visual features and encoded RGCs. Each node of the GNs iteratively computes a question-guided contextualized representation of the visual/textual information assigned to it. Then, to combine the information from both GNs, the nodes write the updated representations to an external spatial memory. The final states of the memory cells are fed into an answer module to predict an answer. Experiments show MN-GMN rivals the state-of-the-art models on Visual7W, VQA-v2.0, and CLEVR datasets.</abstract>
<identifier type="citekey">khademi-2020-multimodal</identifier>
<identifier type="doi">10.18653/v1/2020.acl-main.643</identifier>
<location>
<url>https://aclanthology.org/2020.acl-main.643</url>
</location>
<part>
<date>2020-07</date>
<extent unit="page">
<start>7177</start>
<end>7188</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Multimodal Neural Graph Memory Networks for Visual Question Answering
%A Khademi, Mahmoud
%Y Jurafsky, Dan
%Y Chai, Joyce
%Y Schluter, Natalie
%Y Tetreault, Joel
%S Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics
%D 2020
%8 July
%I Association for Computational Linguistics
%C Online
%F khademi-2020-multimodal
%X We introduce a new neural network architecture, Multimodal Neural Graph Memory Networks (MN-GMN), for visual question answering. The MN-GMN uses graph structure with different region features as node attributes and applies a recently proposed powerful graph neural network model, Graph Network (GN), to reason about objects and their interactions in an image. The input module of the MN-GMN generates a set of visual features plus a set of encoded region-grounded captions (RGCs) for the image. The RGCs capture object attributes and their relationships. Two GNs are constructed from the input module using the visual features and encoded RGCs. Each node of the GNs iteratively computes a question-guided contextualized representation of the visual/textual information assigned to it. Then, to combine the information from both GNs, the nodes write the updated representations to an external spatial memory. The final states of the memory cells are fed into an answer module to predict an answer. Experiments show MN-GMN rivals the state-of-the-art models on Visual7W, VQA-v2.0, and CLEVR datasets.
%R 10.18653/v1/2020.acl-main.643
%U https://aclanthology.org/2020.acl-main.643
%U https://doi.org/10.18653/v1/2020.acl-main.643
%P 7177-7188
Markdown (Informal)
[Multimodal Neural Graph Memory Networks for Visual Question Answering](https://aclanthology.org/2020.acl-main.643) (Khademi, ACL 2020)
ACL