Pna #471

Open · wants to merge 2 commits into base: main
37 changes: 37 additions & 0 deletions examples/pna/README.md
@@ -0,0 +1,37 @@
# Principal Neighbourhood Aggregation for Graph Nets (PNA)

[Principal Neighbourhood Aggregation for Graph Nets (PNA)](https://arxiv.org/abs/2004.05718) is a graph learning model that combines multiple neighbourhood aggregators with degree-based scalers.
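
For intuition, the combine step multiplies every aggregator output by every degree scaler and concatenates the results. Below is a minimal NumPy sketch of that step for a single node, following the paper's definitions; the function and argument names are illustrative, not identifiers from this PR:

```
import numpy as np

def pna_aggregate(neigh_feats, delta):
    """Combine neighbour messages with PNA's aggregators and scalers.

    neigh_feats: [degree, hidden] messages from one node's neighbours
                 (degree assumed >= 1).
    delta: mean of log(degree + 1) over the training set, as in the paper.
    """
    degree = neigh_feats.shape[0]
    aggregated = [
        neigh_feats.mean(axis=0),
        neigh_feats.max(axis=0),
        neigh_feats.min(axis=0),
        neigh_feats.std(axis=0),
    ]
    amplification = np.log(degree + 1) / delta  # S(d, alpha=1)
    attenuation = delta / np.log(degree + 1)    # S(d, alpha=-1)
    scalers = [1.0, amplification, attenuation]
    # Each (aggregator, scaler) pair contributes one hidden-sized slice.
    return np.concatenate([s * a for a in aggregated for s in scalers])
```

In the full model, this concatenated vector is then fed through the post-aggregation MLP (`post_layers`) inside each tower.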


### Datasets

We run graph classification experiments on [OGB](https://ogb.stanford.edu/) to reproduce the results reported in the paper.

### Dependencies

- paddlepaddle >= 2.2.0
- pgl >= 2.2.4
- ogb
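
They can be installed via pip, for example:

```
pip install "paddlepaddle>=2.2.0" "pgl>=2.2.4" ogb
```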

### How to run


```
python main.py --config config.yaml # train on ogbg-molhiv
python main.py --config config_pcba.yaml # train on ogbg-molpcba
```


### Important Hyperparameters

- aggregators: the list of aggregators to use ("mean", "sum", "max", "min", "var", "std").
- scalers: the list of degree scalers to use ("identity", "amplification", "attenuation", "linear", "inverse_linear").
- towers: the number of towers the hidden features are divided into.
- divide_input: whether the input features are split between towers (see the sketch below).
- pre_layers: the number of MLP layers applied before aggregation.
- post_layers: the number of MLP layers applied after aggregation.
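
A minimal sketch of what `divide_input` controls, assuming the hidden dimension divides evenly by `towers` (hypothetical snippet, not code from this PR):

```
import numpy as np

hidden_size, towers = 128, 4
x = np.random.rand(32, hidden_size)  # a batch of node features

# divide_input=True: each tower operates on its own hidden_size // towers slice.
tower_inputs = np.split(x, towers, axis=-1)

# divide_input=False: every tower sees the full feature vector.
tower_inputs = [x for _ in range(towers)]
```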

### Experiment results (ROC-AUC for HIV, AP for PCBA)

|      | GIN    | PNA (paper) | PNA (ours) |
|------|--------|-------------|------------|
| HIV  | 0.7778 | 0.7905      | 0.7929     |
| PCBA | 0.2266 | 0.2838      | 0.2801     |
43 changes: 43 additions & 0 deletions examples/pna/config.yaml
@@ -0,0 +1,43 @@
task_name: train.hiv
dataset_name: ogbg-molhiv
metrics: rocauc

hidden_size: 128
out_size: 128
dropout: 0.3
num_layers: 4
batch_norm: True
residual: True
aggregators: ["mean","max","min", "std"]
scalers: ["identity", "amplification", "attenuation"]
in_feat_dropout: 0
post_layers: 1
pre_layers: 1
towers: 1
edge_feat: True
optim: momentum

seed: 41
# data config
num_class: 1

# runconfig
epochs: 200
batch_size: 128
lr: 0.01
lr_reduce_factor: 0.5
lr_schedule_patience: 20
min_lr: 0.0001
weight_decay: 0.000003
num_workers: 4
shuffle: True
max_time: 48
log_step: 100

# logger
stdout: True
log_dir: ./logs
log_filename: log.txt
save_dir: ./checkpoints
output_dir: ./outputs
files2saved: ["*.yaml", "*.py", "./utils"]
42 changes: 42 additions & 0 deletions examples/pna/config_pcba.yaml
@@ -0,0 +1,42 @@
task_name: train.pcba
dataset_name: ogbg-molpcba
metrics: ap

hidden_size: 510
out_size: 510
dropout: 0.2
num_layers: 4
batch_norm: True
residual: True
aggregators: ["mean", "sum", "max"]
scalers: ["identity"]
in_feat_dropout: 0.0
post_layers: 1
pre_layers: 1
towers: 5
edge_feat: True
seed: 41
optim: adam
# data config
num_class: 128

# runconfig
epochs: 100
batch_size: 512
lr: 0.0005
lr_reduce_factor: 0.8
lr_schedule_patience: 4
min_lr: 0.00002
weight_decay: 0.000003
num_workers: 4
shuffle: True
max_time: 48
log_step: 100

# logger
stdout: True
log_dir: ./logs
log_filename: log.txt
save_dir: ./checkpoints
output_dir: ./outputs
files2saved: ["*.yaml", "*.py", "./utils"]
134 changes: 134 additions & 0 deletions examples/pna/dataset.py
@@ -0,0 +1,134 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import sys
import json
import numpy as np
import glob
import copy
import time
import argparse
from collections import OrderedDict, namedtuple
from scipy.sparse import csr_matrix
import pgl
import paddle
from pgl.utils.data.dataset import Dataset, StreamDataset, HadoopDataset
from pgl.utils.data import Dataloader
from pgl.utils.logger import log

from utils.config import prepare_config, make_dir
from ogb.graphproppred import GraphPropPredDataset
from ogb.utils.features import get_atom_feature_dims, get_bond_feature_dims


class Subset(Dataset):
r"""
Subset of a dataset at specified indices.
Arguments:
dataset (Dataset): The whole Dataset
indices (sequence): Indices in the whole set selected for subset
"""

def __init__(self, dataset, indices, mode='train'):
self.dataset = dataset
if paddle.distributed.get_world_size() == 1 or mode != "train":
self.indices = indices
else:
self.indices = indices[int(paddle.distributed.get_rank())::int(
paddle.distributed.get_world_size())]

self.mode = mode

def __getitem__(self, idx):
return self.dataset[self.indices[idx]]

def __len__(self):
return len(self.indices)


class ShardedDataset(Dataset):
"""
SharderDataset
"""

def __init__(self, data, mode="train"):
if paddle.distributed.get_world_size() == 1 or mode != "train":
self.data = data
else:
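            # Rank-strided sharding: worker `rank` keeps every
            # `world_size`-th example, so the shards are disjoint.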
self.data = data[int(paddle.distributed.get_rank())::int(
paddle.distributed.get_world_size())]

def __getitem__(self, idx):
return self.data[idx]

def __len__(self):
return len(self.data)


class MolDataset(Dataset):
"""
Transfer raw ogb dataset to pgl dataset
"""

def __init__(self, config, raw_dataset, mode='train', transform=None):
self.config = config
self.raw_dataset = raw_dataset
self.mode = mode

log.info("preprocess graph data in %s" % self.__class__.__name__)
self.graph_list = []
self.label = []
for i in range(len(self.raw_dataset)):
# num_nodes, edge_index, node_feat, edge_feat, label
graph, label = self.raw_dataset[i]
num_nodes = graph['num_nodes']
node_feat = graph['node_feat'].copy()
edges = list(zip(graph["edge_index"][0], graph["edge_index"][1]))
edge_feat = graph['edge_feat'].copy()
main_graph = pgl.Graph(
num_nodes=num_nodes,
edges=edges,
node_feat={'feat': node_feat},
edge_feat={'feat': edge_feat})
self.graph_list.append(main_graph)
self.label.append(label)

def __getitem__(self, idx):
return self.graph_list[idx], self.label[idx]

def __len__(self):
return len(self.graph_list)


class CollateFn(object):
def __init__(self):
pass

def __call__(self, batch_data):
graph_list = []
labels = []
for g, label in batch_data:
if g is None:
continue
graph_list.append(g)
labels.append(label)

        labels = np.array(labels)
        # Multi-task OGB labels use NaN for missing targets; since NaN != NaN,
        # this mask is False exactly at the missing entries.
        batch_valid = (labels == labels).astype("bool")
        labels = np.nan_to_num(labels).astype("float32")

g = pgl.Graph.batch(graph_list)
return g, labels, batch_valid
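

# Example usage (a minimal sketch; main.py is the authoritative entry point,
# and the config values below are illustrative):
#
#     raw = GraphPropPredDataset(name="ogbg-molhiv")
#     splits = raw.get_idx_split()
#     dataset = MolDataset(config, raw)
#     train_ds = Subset(dataset, splits["train"], mode="train")
#     loader = Dataloader(train_ds, batch_size=128, shuffle=True,
#                         num_workers=4, collate_fn=CollateFn())
#     for g, labels, valid in loader:
#         ...  # `valid` masks out NaN labels in the multi-task loss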