real_open-ended_standard.json
[{"team_members": "None", "standard": {"overall": 62.26, "perAnswerType": {"other": 52.32, "number": 36.02, "yes/no": 80.57}}, "team_name_order": 1, "submissionRound": 1, "team_name": "10707_vqa_cao_cmu", "ref": "", "method": "stack co-att"}, {"team_members": "None", "standard": {"overall": 65.92, "perAnswerType": {"other": 56.53, "number": 39.28, "yes/no": 83.68}}, "team_name_order": 2, "submissionRound": 2, "team_name": "AA", "ref": "", "method": "AA"}, {"team_members": "Xu, Huijuan and Saenko, Kate", "temp_team_members": "{Xu, Huijuan and Saenko, Kate},", "standard": {"overall": 58.24, "perAnswerType": {"other": 43.48, "number": 37.53, "yes/no": 80.8}}, "team_name_order": 3, "submissionRound": 6, "team_name": "AAA", "ref": "http://arxiv.org/pdf/1511.05234v2.pdf", "method": "Our Spatial Memory Network stores neuron activations from different spatial regions of the image in its memory, and uses the question to choose relevant regions for computing the answer based on attention mechanism."}, {"team_members": "None", "standard": {"overall": 65.92, "perAnswerType": {"other": 56.5, "number": 38.11, "yes/no": 84.02}}, "team_name_order": 4, "submissionRound": 3, "team_name": "ACM", "ref": "", "method": "NA"}, {"team_members": "Qi Wu (Australia Centre for Visual Technologies, University of Adelaide), Peng Wang (Australia Centre for Visual Technologies, University of Adelaide), Chunhua Shen (Australia Centre for Visual Technologies, University of Adelaide), Anthony Dick (Australia Centre for Visual Technologies, University of Adelaide), Anton van den Hengel (Australia Centre for Visual Technologies, University of Adelaide)", "standard": {"overall": 59.44, "perAnswerType": {"other": 45.83, "number": 37.12, "yes/no": 81.07}}, "team_name_order": 5, "submissionRound": 4, "team_name": "ACVT_Adelaide", "ref": "http://arxiv.org/abs/1511.06973", "method": "We propose a method for visual question answering which combines an internal representation of the content of an image with information extracted from a general knowledge base to answer a broad range of image-based questions. This allows more complex questions to be answered using the predominant neural network-based approach than has previously been possible."}, {"team_members": "", "standard": {"overall": 71.48, "perAnswerType": {"other": 63.3, "number": 45.63, "yes/no": 87.61}}, "team_name_order": 6, "submissionRound": 5, "team_name": "AlphaVQA", "ref": "", "method": "a"}, {"team_members": "Bolei Zhou (Facebook AI Research & MIT), Yuandong Tian (Facebook AI Research), Sainbayar Sukhbaatar (Facebook AI Research & NYU), Arthur Szlam (Facebook AI Research), Rob Fergus (Facebook AI Research).", "standard": {"overall": 55.89, "perAnswerType": {"other": 42.62, "number": 34.98, "yes/no": 76.76}}, "team_name_order": 7, "submissionRound": 1, "team_name": "Bolei", "ref": "http://visualqa.csail.mit.edu", "method": "A improved version of bag of words plus standard deep features."}, {"team_members": "Aaditya Prakash (Brandeis University)", "standard": {"overall": 62.88, "perAnswerType": {"other": 51.91, "number": 37.73, "yes/no": 82.11}}, "team_name_order": 8, "submissionRound": 1, "team_name": "Brandeis", "ref": "http://iamaaditya.github.io/research/vqa/", "method": "We propose a variant of highway network designed to achieve multi-modal learning like VQA. We alter the signal of 'carry' gate with a multiplicand learned from word embeddings of the question. 
A multi-layered highway MLP learns the memory required to associate the image features with word vectors and thus achieves implicit soft attention over learned parameters."}, {"team_members": "None", "standard": {"overall": 68.77, "perAnswerType": {"other": 59.88, "number": 42.94, "yes/no": 85.72}}, "team_name_order": 9, "submissionRound": 5, "team_name": "CFM-UESTC", "ref": "", "method": "answering recurrently and choosing answer by visual gate"}, {"team_members": "None", "standard": {"overall": 65.18, "perAnswerType": {"other": 55.89, "number": 37.8, "yes/no": 83.0}}, "team_name_order": 10, "submissionRound": 11, "team_name": "CMF+COATT", "ref": "", "method": "CMF+COATT"}, {"team_members": "None", "standard": {"overall": 59.33, "perAnswerType": {"other": 45.76, "number": 36.44, "yes/no": 81.04}}, "team_name_order": 11, "submissionRound": 1, "team_name": "CNNAtt", "ref": "", "method": "CNN features and Att features"}, {"team_members": "Gyu-tae Park* (Samsung Electronics, FCS Lab), YoungChul Sohn* (Samsung Electronics, FCS Lab), Kibeom Lee* (Samsung Electronics, FCS Lab), Jong-Ryul Lee* (Samsung Electronics, FCS Lab)", "standard": {"overall": 67.64, "perAnswerType": {"other": 59.06, "number": 41.09, "yes/no": 84.4}}, "team_name_order": 12, "submissionRound": 1, "team_name": "DLAIT (Samsung SWC)", "ref": "", "method": "Improved Memory Networks with Multimodal Attention"}, {"team_members": "", "standard": {"overall": 64.58, "perAnswerType": {"other": 55.16, "number": 39.15, "yes/no": 82.05}}, "team_name_order": 13, "submissionRound": 5, "team_name": "G", "ref": "", "method": "Show, Ask, Attend, and Answer"}, {"team_members": "None", "standard": {"overall": 64.44, "perAnswerType": {"other": 54.08, "number": 40.04, "yes/no": 82.76}}, "team_name_order": 14, "submissionRound": 5, "team_name": "HAIBIN", "ref": "", "method": "dual cross-guided attention with Bi-LSTM"}, {"team_members": "None", "standard": {"overall": 63.88, "perAnswerType": {"other": 53.33, "number": 38.34, "yes/no": 82.72}}, "team_name_order": 15, "submissionRound": 5, "team_name": "Haibin", "ref": "", "method": "dualcross attention"}, {"team_members": "None", "standard": {"overall": 59.62, "perAnswerType": {"other": 46.46, "number": 38.14, "yes/no": 80.5}}, "team_name_order": 16, "submissionRound": 2, "team_name": "KAIST_TCL", "ref": "", "method": "deeper residual attention"}, {"team_members": "Ilija Ilievski (Graduate School for Integrative Sciences and Engineering,, National University of Singapore), Shuicheng Yan (Department of Electrical & Computer Engineering,, National University of Singapore), Jiashi Feng (Department of Electrical & Computer Engineering,, National University of Singapore)", "standard": {"overall": 59.54, "perAnswerType": {"other": 46.1, "number": 35.67, "yes/no": 81.34}}, "team_name_order": 17, "submissionRound": 2, "team_name": "LV-NUS", "ref": "https://arxiv.org/abs/1604.01485", "method": "We propose a novel Focused Dynamic Attention (FDA) model to provide better aligned image content representation with proposed questions. 
Being aware of the key words in the question, FDA employs off-the-shelf object detector to identify important regions, and fuse the information from the regions and global features via an LSTM unit."}, {"team_members": "Kuniaki Saito (University of Tokyo), Andrew Shin (University of Tokyo), Yoshitaka Ushiku (University of Tokyo), Tatsuya Harada (University of Tokyo)", "standard": {"overall": 61.77, "perAnswerType": {"other": 49.75, "number": 37.56, "yes/no": 81.98}}, "team_name_order": 18, "submissionRound": 4, "team_name": "MIL-UT", "ref": "", "method": "Multimodal Dual-Network in which one network performs an addition of all input features to form a common embedding space, and the other performs multiplication. Inputs to each network consist of fc6 from VGG-19, and the uppermost fully-connected layer from Resnet-152 and Resnet-101. We implemented 19 such dual networks with varying dimensions, and averaged out."}, {"team_members": "Caiming Xiong, Stephen Merity, Richard Socher", "standard": {"overall": 60.36, "perAnswerType": {"other": 48.33, "number": 36.82, "yes/no": 80.43}}, "team_name_order": 19, "submissionRound": 2, "team_name": "MMCX", "ref": "", "method": "CNN+GRU Memory"}, {"team_members": "", "standard": {"overall": 65.81, "perAnswerType": {"other": 55.56, "number": 37.42, "yes/no": 85.01}}, "team_name_order": 20, "submissionRound": 4, "team_name": "Minh-ARTORG", "ref": "", "method": "WMLB"}, {"team_members": "Mujtaba hasan (Indian Institute of Technology, Delhi)", "standard": {"overall": 57.36, "perAnswerType": {"other": 42.24, "number": 36.92, "yes/no": 80.28}}, "team_name_order": 21, "submissionRound": 1, "team_name": "Mujtaba hasan", "ref": "", "method": "We use finetuned VGG_19 for image representation and a novel combination of deep LSTMs and GRUs for the text analysis and train a fully connected layer on top of that for final task. We use backpropagation for end to end training and testing purposes.The weights of proposed joint network are initialized with pretrained CNN and GRU."}, {"team_members": "None", "standard": {"overall": 56.65, "perAnswerType": {"other": 41.64, "number": 36.06, "yes/no": 79.47}}, "team_name_order": 22, "submissionRound": 3, "team_name": "NUDT\u4e01\u5146\u4e91DM\u8bfe\u7a0b", "ref": "aa", "method": "AA"}, {"team_members": "Hyeonseob Nam (Naver Labs), Jeonghee Kim (Naver Labs)", "standard": {"overall": 64.79, "perAnswerType": {"other": 54.62, "number": 38.7, "yes/no": 83.31}}, "team_name_order": 23, "submissionRound": 1, "team_name": "Naver Labs", "ref": "", "method": "Dual Attention Networks (DANs) apply an attention mechanism on both image regions and question words through multiple stages. DANs focus on specific words that are relevant to the answers or the regions to attend to. 
152-layer Deep Residual Network is used to extract high-level image features."}, {"team_members": "Jim Lee", "standard": {"overall": 59.14, "perAnswerType": {"other": 45.41, "number": 36.66, "yes/no": 80.94}}, "team_name_order": 24, "submissionRound": 1, "team_name": "OMG", "ref": "", "method": "neural module network"}, {"team_members": "None", "standard": {"overall": 64.12, "perAnswerType": {"other": 53.37, "number": 38.02, "yes/no": 83.33}}, "team_name_order": 25, "submissionRound": 4, "team_name": "POSTECH", "ref": "", "method": "Training recurrent answering units with joint loss minimization (VQA +VisualGenome region description, ResNet101 are used)"}, {"team_members": "None", "standard": {"overall": 69.94, "perAnswerType": {"other": 65.78, "number": 41.34, "yes/no": 82.04}}, "team_name_order": 26, "submissionRound": 3, "team_name": "R-Lab", "ref": "", "method": "R-Lab"}, {"team_members": "None", "standard": {"overall": 56.61, "perAnswerType": {"other": 42.13, "number": 35.97, "yes/no": 78.82}}, "team_name_order": 27, "submissionRound": 1, "team_name": "RIT", "ref": "", "method": "We created a Bayesian QDA variant and combine it with a deep neural network."}, {"team_members": "None", "standard": {"overall": 67.87, "perAnswerType": {"other": 60.4, "number": 38.67, "yes/no": 84.02}}, "team_name_order": 28, "submissionRound": 3, "team_name": "ReasonNet", "ref": "", "method": "dsa"}, {"team_members": "Ruiyu Li", "standard": {"overall": 60.76, "perAnswerType": {"other": 47.77, "number": 36.81, "yes/no": 82.07}}, "team_name_order": 29, "submissionRound": 5, "team_name": "SHB_1026", "ref": "", "method": "A deep reasoning network for VQA with question representation update."}, {"team_members": "None", "standard": {"overall": 69.23, "perAnswerType": {"other": 64.63, "number": 42.52, "yes/no": 81.37}}, "team_name_order": 30, "submissionRound": 2, "team_name": "Tech-Ilin", "ref": "", "method": "Highorder"}, {"submissionRound": 2, "temp_team_members": "{Fukui, Akira and Park, Dong Huk and Yang, Daylen and Rohrbach, Anna and Darrell, Trevor and Rohrbach, Marcus},", "standard": {"overall": 66.47, "perAnswerType": {"other": 58.0, "number": 39.47, "yes/no": 83.24}}, "team_name_order": 31, "team_members": "Akira Fukui (UC Berkeley EECS, Sony Corp. Tokyo), Dong Huk Park (UC Berkeley EECS), Daylen Yang (UC Berkeley EECS), Anna Rohrbach (UC Berkeley EECS, Max Planck Institute for Informatics, Saarbrucken), Trevor Darrell (UC Berkeley EECS), Marcus Rohrbach (UC Berkeley EECS)", "team_name": "UC Berkeley & Sony", "ref": "https://arxiv.org/abs/1606.01847", "method": "We propose utilizing Multimodal Compact Bilinear pooling (MCB) to efficiently and expressively combine multimodal features. 
We present an architecture which uses MCB twice, once for predicting attention over spatial features and again to combine the attended representation with the question representation."}, {"team_members": "Jacob Andreas (UC Berkeley), Marcus Rohrbach (UC Berkeley), Trevor Darrell (UC Berkeley), Dan Klein (UC Berkeley)", "standard": {"overall": 59.44, "perAnswerType": {"other": 45.81, "number": 37.48, "yes/no": 80.98}}, "team_name_order": 32, "submissionRound": 3, "team_name": "UC Berkeley (DNMN)", "ref": "https://github.com/jacobandreas/nmn2", "method": "Neural module network: question-answering network assembled dynamically from a collection of jointly-trained modules."}, {"submissionRound": 2, "temp_team_members": "{Andreas, Jacob and Rohrbach, Marcus and Darrell, Trevor and Klein, Dan},", "standard": {"overall": 58.66, "perAnswerType": {"other": 44.01, "number": 37.7, "yes/no": 81.16}}, "team_name_order": 33, "team_members": "Jacob Andreas (UC Berkeley), Marcus Rohrbach (UC Berkeley), Trevor Darrell (UC Berkeley), Dan Klein (UC Berkeley)", "team_name": "UC Berkeley (NMN)", "ref": "github.com/jacobandreas/nmn2", "method": "Neural module network: question-specific attentional neural network assembled from a collection of jointly-trained modules based on a fixed semantic parse."}, {"team_members": "Issey Masuda Mora (Universitat Polit\u00e8cnica de Catalunya (UPC)), Santiago Pascual de la Puente (Universitat Polit\u00e8cnica de Catalunya (UPC)), Xavier Gir\u00f3-i-Nieto (Universitat Polit\u00e8cnica de Catalunya (UPC))", "standard": {"overall": 53.62, "perAnswerType": {"other": 36.7, "number": 35.53, "yes/no": 78.05}}, "team_name_order": 34, "submissionRound": 3, "team_name": "UPC", "ref": "http://imatge-upc.github.io/vqa-2016-cvprw/", "method": "Visual features are extracted with a Kernelized CNN [Liu 2015] and projected into the same space as the embedding of the question, which is a sentence embedding. The model it is not a classifier upon the N most frequent answers but an encoder-decoder architecture. The decoder predicts the answers word by word (the presented model only outputs one word per answer but it can be extended)."}, {"team_members": "None", "standard": {"overall": 67.36, "perAnswerType": {"other": 58.35, "number": 39.79, "yes/no": 84.91}}, "team_name_order": 35, "submissionRound": 2, "team_name": "UPMC-LIP6", "ref": "", "method": "logs/bagging/17_03_17_11:10:38"}, {"team_members": "Marc Bola\u00f1os (Universitat de Barcelona / Computer Vision Center, Bellaterra),, \u00c1lvaro Peris (PRHLT Research Center, Universitat Polit\u00e8cnica de Val\u00e8ncia),, Petia Radeva (Universitat de Barcelona / Computer Vision Center, Bellaterra),, Francisco Casacuberta (PRHLT Research Center, Universitat Polit\u00e8cnica de Val\u00e8ncia)", "standard": {"overall": 55.77, "perAnswerType": {"other": 40.27, "number": 36.33, "yes/no": 78.88}}, "team_name_order": 36, "submissionRound": 1, "team_name": "UPV_UB", "ref": "", "method": "Our method makes use of a Bidirectional LSTM network for processing the question and a kernelized CNN for processing the visual information. 
Features extracted by these models were provided to a single-layered MLP classifier over the most common 2000 answers."}, {"team_members": "None", "standard": {"overall": 65.44, "perAnswerType": {"other": 57.15, "number": 38.08, "yes/no": 82.08}}, "team_name_order": 37, "submissionRound": 2, "team_name": "UT", "ref": "", "method": "Final"}, {"submissionRound": 5, "temp_team_members": "{Lu, Jiasen and Yang, Jianwei and Batra, Dhruv and Parikh, Devi},", "standard": {"overall": 62.06, "perAnswerType": {"other": 51.95, "number": 38.22, "yes/no": 79.95}}, "team_name_order": 38, "team_members": "Jiasen Lu (Virginia Tech), Jianwei Yang (Virginia Tech), Dhruv Batra (Virginia Tech), Devi Parikh (Virginia Tech)", "team_name": "VTComputerVison", "ref": "https://arxiv.org/abs/1606.00061", "method": "We present a novel co-attention model for VQA that jointly reasons about image and question attention. And our model reasons about the question (and consequently the image via the co-attention mechanism) in a hierarchical fashion via a novel 1-dimensional convolution neural networks (CNN) model."}, {"team_members": "Jiasen Lu (Virginia Tech), Dhruv Batra (Virginia Tech), Devi Parikh (Virginia Tech)", "standard": {"overall": 60.33, "perAnswerType": {"other": 47.87, "number": 38.14, "yes/no": 80.56}}, "team_name_order": 39, "submissionRound": 1, "team_name": "VT_CV_Jiasen", "ref": "", "method": "Stacked Visual question answering model with 1-layer alternating question attention."}, {"team_members": "None", "standard": {"overall": 50.13, "perAnswerType": {"other": 39.03, "number": 29.72, "yes/no": 68.31}}, "team_name_order": 40, "submissionRound": 7, "team_name": "ZJU_LZP", "ref": "", "method": "CNNQAT"}, {"team_members": "None", "standard": {"overall": 59.55, "perAnswerType": {"other": 48.27, "number": 36.99, "yes/no": 78.48}}, "team_name_order": 41, "submissionRound": 4, "team_name": "abhshkdz", "ref": "https://arxiv.org/abs/1511.02274", "method": "Implementation of Stacked Attention Network, Yang et al., CVPR16"}, {"team_members": "None", "standard": {"overall": 55.34, "perAnswerType": {"other": 40.27, "number": 35.3, "yes/no": 78.1}}, "team_name_order": 42, "submissionRound": 1, "team_name": "att", "ref": "", "method": "att"}, {"team_members": "yu long", "standard": {"overall": 56.44, "perAnswerType": {"other": 41.14, "number": 35.85, "yes/no": 79.61}}, "team_name_order": 43, "submissionRound": 3, "team_name": "bl", "ref": "N/A", "method": "LSTM_Q_I_.471"}, {"team_members": "None", "standard": {"overall": 68.12, "perAnswerType": {"other": 59.42, "number": 40.25, "yes/no": 85.38}}, "team_name_order": 44, "submissionRound": 6, "team_name": "cama_hdu", "ref": "", "method": "mfb ensemble"}, {"team_members": "None", "standard": {"overall": 61.14, "perAnswerType": {"other": 49.8, "number": 34.44, "yes/no": 81.21}}, "team_name_order": 45, "submissionRound": 5, "team_name": "charles", "ref": "", "method": "just a test"}, {"team_members": "None", "standard": {"overall": 58.63, "perAnswerType": {"other": 47.76, "number": 36.98, "yes/no": 76.85}}, "team_name_order": 46, "submissionRound": 1, "team_name": "delta", "ref": "", "method": "att"}, {"team_members": "None", "standard": {"overall": 61.3, "perAnswerType": {"other": 51.39, "number": 38.77, "yes/no": 78.63}}, "team_name_order": 47, "submissionRound": 6, "team_name": "dfyu_test", "ref": "", "method": "state of art model currently which achieved 0.533 accuracy on val2 set"}, {"team_members": "None", "standard": {"overall": 65.9, "perAnswerType": {"other": 56.67, 
"number": 37.45, "yes/no": 83.91}}, "team_name_order": 48, "submissionRound": 2, "team_name": "dynamic routing", "ref": "", "method": "pro 525"}, {"team_members": "", "standard": {"overall": 58.43, "perAnswerType": {"other": 46.32, "number": 36.27, "yes/no": 78.24}}, "team_name_order": 49, "submissionRound": 2, "team_name": "global_vision", "ref": "", "method": "Residual Net + Classification"}, {"team_members": "None", "standard": {"overall": 56.02, "perAnswerType": {"other": 42.61, "number": 34.28, "yes/no": 77.25}}, "team_name_order": 50, "submissionRound": 3, "team_name": "goncalo", "ref": "", "method": "stacked attention networks"}, {"team_members": "Kushal Kafle (Chester F. Carlson Center for Imaging Science, Rochester Institute of Technology), and, Christopher Kanan (Chester F. Carlson Center for Imaging Science, Rochester Institute of Technology)", "standard": {"overall": 61.69, "perAnswerType": {"other": 49.61, "number": 39.27, "yes/no": 81.53}}, "team_name_order": 51, "submissionRound": 5, "team_name": "klab", "ref": "http://www.kushalkafle.com/kafle2016.pdf", "method": "Observing that the type of answer can be predicted from question alone, we formulated a Bayesian framework to incorporate answer-type prediction into a VQA pipeline. The current result consists of an improved MLP model trained using data augmentation. The probabilities produced by this MLP model are then combined with a residual attention mechanism to get the predicted answers."}, {"team_members": "None", "standard": {"overall": 64.22, "perAnswerType": {"other": 52.99, "number": 41.01, "yes/no": 83.25}}, "team_name_order": 52, "submissionRound": 2, "team_name": "msra-vrt", "ref": "", "method": "multi-level attention"}, {"team_members": "None", "standard": {"overall": 68.14, "perAnswerType": {"other": 59.27, "number": 40.99, "yes/no": 85.41}}, "team_name_order": 53, "submissionRound": 1, "team_name": "s", "ref": "", "method": "s"}, {"team_members": "None", "standard": {"overall": 58.85, "perAnswerType": {"other": 46.42, "number": 36.41, "yes/no": 79.11}}, "team_name_order": 54, "submissionRound": 1, "team_name": "san", "ref": "", "method": "san"}, {"team_members": "Jin-Hwa Kim, Kyoung Woon On, Jeonghee Kim, Jung-Woo Ha, Byoung-Tak Zhang", "standard": {"overall": 66.89, "perAnswerType": {"other": 57.79, "number": 39.07, "yes/no": 84.61}}, "team_name_order": 55, "submissionRound": 2, "team_name": "snubi-naverlabs", "ref": "https://goo.gl/4LFRqy", "method": "Multimodal Low-rank Bilinear Pooling"}, {"team_members": "", "standard": {"overall": 66.15, "perAnswerType": {"other": 56.55, "number": 41.27, "yes/no": 83.69}}, "team_name_order": 56, "submissionRound": 3, "team_name": "ustc_dev", "ref": "", "method": "joint visual and textual attention"}, {"team_members": "None", "standard": {"overall": 67.16, "perAnswerType": {"other": 57.91, "number": 40.02, "yes/no": 84.87}}, "team_name_order": 57, "submissionRound": 2, "team_name": "vqahhi_drau", "ref": "", "method": "FRCNN DRAU with MCB"}, {"submissionRound": 1, "temp_team_members": "{Jiasen Lu and Aishwarya Agrawal and Stanislaw Antol and Margaret Mitchell and C. Lawrence Zitnick and Dhruv Batra and Devi Parikh},", "standard": {"overall": 29.72, "perAnswerType": {"other": 1.26, "number": 0.43, "yes/no": 70.53}}, "team_name_order": 58, "team_members": "Jiasen Lu (Virginia Tech), Aishwarya Agrawal (Virginia Tech), Stanislaw Antol (Virginia Tech), Margaret Mitchell (Microsoft Research), C. 
Lawrence Zitnick (Facebook AI Research), Dhruv Batra (Virginia Tech), Devi Parikh (Virginia Tech)", "team_name": "vqateam-all_yes", "ref": "", "method": "\"yes\" (prior) is predicted as the answer for all questions"}, {"submissionRound": 2, "temp_team_members": "{Jiasen Lu and Aishwarya Agrawal and Stanislaw Antol and Margaret Mitchell and C. Lawrence Zitnick and Dhruv Batra and Devi Parikh},", "standard": {"overall": 58.16, "perAnswerType": {"other": 43.73, "number": 36.53, "yes/no": 80.56}}, "team_name_order": 59, "team_members": "Jiasen Lu (Virginia Tech), Aishwarya Agrawal (Virginia Tech), Stanislaw Antol (Virginia Tech), Margaret Mitchell (Microsoft Research), C. Lawrence Zitnick (Facebook AI Research), Dhruv Batra (Virginia Tech), Devi Parikh (Virginia Tech)", "team_name": "vqateam-deeperLSTM_NormlizeCNN", "ref": "", "method": "2-channel (image and question) model. Question channel (LSTM with 2 hidden layers) provides question representation and the image channel (activations from last hidden layer of VGGNet) provides image representation. The image features thus obtained are l2 normalized. Question and image features are pointwise multiplied and fed to fully connected layer to obtain softmax distribution over 1000 answers."}, {"submissionRound": 1, "temp_team_members": "{Jiasen Lu and Aishwarya Agrawal and Stanislaw Antol and Margaret Mitchell and C. Lawrence Zitnick and Dhruv Batra and Devi Parikh},", "standard": {"overall": 54.06, "perAnswerType": {"other": 36.8, "number": 35.55, "yes/no": 79.01}}, "team_name_order": 60, "team_members": "Jiasen Lu (Virginia Tech), Aishwarya Agrawal (Virginia Tech), Stanislaw Antol (Virginia Tech), Margaret Mitchell (Microsoft Research), C. Lawrence Zitnick (Facebook AI Research), Dhruv Batra (Virginia Tech), Devi Parikh (Virginia Tech)", "team_name": "vqateam-lstm_cnn", "ref": "", "method": "2-channel (image and question) model. Question channel (LSTM with 1 hidden layer) provides question representation and the image channel (activations from last hidden layer of VGGNet) provides image representation. Question and image features are pointwise multiplied and fed to fully connected layer to obtain softmax distribution over 1000 answers."}, {"team_members": "Aishwarya Agrawal (Virginia Tech), Jiasen Lu (Virginia Tech), Stanislaw Antol (Virginia Tech), Margaret Mitchell (Microsoft Research), C. Lawrence Zitnick (Facebook AI Research), Dhruv Batra (Virginia Tech), Devi Parikh (Virginia Tech)", "standard": {"overall": 42.73, "perAnswerType": {"other": 22.0, "number": 24.31, "yes/no": 71.73}}, "team_name_order": 61, "submissionRound": 1, "team_name": "vqateam-nearest_neighbor", "ref": "", "method": "For every question in the test set, we find its k nearest neighbor questions in the training set using cosine similarity in Skip-Thought feature space. In this set of k questions and their associated images, we find the image which is most similar to the query image using cosine similarity in fc7 feature space. The most common ground truth answer of this most similar image and question pair is the predicted answer for the query image and question pair."}, {"submissionRound": 1, "temp_team_members": "{Jiasen Lu and Aishwarya Agrawal and Stanislaw Antol and Margaret Mitchell and C. 
Lawrence Zitnick and Dhruv Batra and Devi Parikh},", "standard": {"overall": 37.55, "perAnswerType": {"other": 9.32, "number": 35.63, "yes/no": 71.17}}, "team_name_order": 62, "team_members": "Jiasen Lu (Virginia Tech), Aishwarya Agrawal (Virginia Tech), Stanislaw Antol (Virginia Tech), Margaret Mitchell (Microsoft Research), C. Lawrence Zitnick (Facebook AI Research), Dhruv Batra (Virginia Tech), Devi Parikh (Virginia Tech)", "team_name": "vqateam-prior_per_qtype", "ref": "", "method": "We pick the most common answer per question type as the predicted answer"}, {"team_members": "Stanislaw Antol and Aishwarya Agrawal and Jiasen Lu and Margaret Mitchell and Dhruv Batra and C. Lawrence Zitnick and Devi Parikh", "temp_team_members": "{Stanislaw Antol and Aishwarya Agrawal and Jiasen Lu and Margaret Mitchell and Dhruv Batra and C. Lawrence Zitnick and Devi Parikh},", "standard": {"overall": 48.89, "perAnswerType": {"other": 26.99, "number": 34.94, "yes/no": 78.12}}, "team_name_order": 63, "submissionRound": 1, "team_name": "vqateam-q_lstm_alone", "ref": "http://visualqa.org/VQA_ICCV2015.pdf", "method": "lstm using question only (without image)"}, {"date": "2018-07-28"}]
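
Each record in the array above describes one leaderboard entry: "team_name", "team_members", "submissionRound", an optional "ref" and free-text "method", and a "standard" block holding the "overall" accuracy plus the "perAnswerType" breakdown ("other", "number", "yes/no"); the final {"date": "2018-07-28"} object is metadata only. Below is a minimal sketch of how such a file could be loaded and ranked, assuming the array is stored as valid JSON under the filename above; the script, its variable names, and its output format are illustrative and not part of the dataset.

import json

# Illustrative only -- not part of the leaderboard data. Assumes the JSON
# array above is saved locally as "real_open-ended_standard.json".
with open("real_open-ended_standard.json") as f:
    entries = json.load(f)

# The trailing {"date": "2018-07-28"} record carries no scores, so keep only
# entries that report a "standard" accuracy block.
scored = [e for e in entries if "standard" in e]

# Rank teams by overall open-ended accuracy and show the per-answer-type split.
for e in sorted(scored, key=lambda x: x["standard"]["overall"], reverse=True):
    per_type = e["standard"]["perAnswerType"]
    print(f'{e["team_name"]:<30} overall={e["standard"]["overall"]:.2f}  '
          f'yes/no={per_type["yes/no"]:.2f}  number={per_type["number"]:.2f}  '
          f'other={per_type["other"]:.2f}')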