From 5570242624e163e541716a993918b7d534de6369 Mon Sep 17 00:00:00 2001
From: "Valentin G."
Date: Mon, 18 Nov 2024 19:51:32 +0100
Subject: [PATCH 01/10] solve issue 9755 (fix typo) (#9790)

Fixed the typo in the description of NeighborLoader.
---
 torch_geometric/loader/neighbor_loader.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch_geometric/loader/neighbor_loader.py b/torch_geometric/loader/neighbor_loader.py
index 341f2f5a23b6..5814724f0c48 100644
--- a/torch_geometric/loader/neighbor_loader.py
+++ b/torch_geometric/loader/neighbor_loader.py
@@ -14,7 +14,7 @@ class NeighborLoader(NodeLoader):
     This loader allows for mini-batch training of GNNs on large-scale graphs
     where full-batch training is not feasible.
 
-    More specifically, :obj:`num_neighbors` denotes how much neighbors are
+    More specifically, :obj:`num_neighbors` denotes how many neighbors are
     sampled for each node in each iteration.
     :class:`~torch_geometric.loader.NeighborLoader` takes in this list of
     :obj:`num_neighbors` and iteratively samples :obj:`num_neighbors[i]` for

From e1a925b792631d4362e4dab1cc75d64f41848250 Mon Sep 17 00:00:00 2001
From: Junhao Shen
Date: Tue, 19 Nov 2024 14:18:30 -0600
Subject: [PATCH 02/10] add GLEM model, TAGDataset and example of GLEM (#9662)

Reopened #9591. Feature summary:

- Add GLEM as a GNN & LLM co-training model to PyG
- Adapt GLEM's LM to AutoModelForSequenceClassification from transformers
- LoRA support
- LM/LLM support
- ogbn-products/ogbn-arxiv testing finished
- TAGDataset can be used as a wrapper class for any node classification
  dataset in PyG with an LM tokenizer and associated raw text
- External predictions supported as pseudo labels

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Rishi Puri
Co-authored-by: Akihiro Nitta
---
 CHANGELOG.md                            |   2 +
 examples/llm/README.md                  |   7 +-
 examples/llm/glem.py                    | 443 ++++++++++++++++++++++++
 torch_geometric/datasets/__init__.py    |   2 +
 torch_geometric/datasets/tag_dataset.py | 350 +++++++++++++++++++
 torch_geometric/nn/models/__init__.py   |   3 +-
 torch_geometric/nn/models/glem.py       | 384 ++++++++++++++++++++
 7 files changed, 1187 insertions(+), 4 deletions(-)
 create mode 100644 examples/llm/glem.py
 create mode 100644 torch_geometric/datasets/tag_dataset.py
 create mode 100644 torch_geometric/nn/models/glem.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 73a782026189..91da66973cef 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ### Added
 
+- Added `nn.models.GLEM` ([#9662](https://github.com/pyg-team/pytorch_geometric/pull/9662))
+- Added `TAGDataset` ([#9662](https://github.com/pyg-team/pytorch_geometric/pull/9662))
 - Added support for fast `Delaunay()` triangulation via the `torch_delaunay` package ([#9748](https://github.com/pyg-team/pytorch_geometric/pull/9748))
 - Added PyTorch 2.5 support ([#9779](https://github.com/pyg-team/pytorch_geometric/pull/9779), [#9780](https://github.com/pyg-team/pytorch_geometric/pull/9780))
 - Support 3D tetrahedral mesh elements of shape `[4, num_faces]` in the `FaceToEdge` transformation ([#9776](https://github.com/pyg-team/pytorch_geometric/pull/9776))
diff --git a/examples/llm/README.md b/examples/llm/README.md
index f1f01428d991..e0ac02d87f2e 100644
--- a/examples/llm/README.md
+++ b/examples/llm/README.md
@@ -1,5 +1,6 @@
 # Examples for Co-training LLMs and GNNs
 
-| Example                              | Description                                                                                                                                                  |
-| ------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| [`g_retriever.py`](./g_retriever.py) | Example for Retrieval-Augmented Generation (RAG) w/ GNN+LLM by co-training `LLAMA2` with `GAT` for answering questions based on knowledge graph information |
+| Example                              | Description                                                                                                                                                                                      |
+| ------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| [`g_retriever.py`](./g_retriever.py) | Example for Retrieval-Augmented Generation (RAG) w/ GNN+LLM by co-training `LLAMA2` with `GAT` for answering questions based on knowledge graph information                                      |
+| [`glem.py`](./glem.py)               | Example for [GLEM](https://arxiv.org/abs/2210.14709), a GNN+LLM co-training model via variational Expectation-Maximization (EM) framework on node classification tasks to achieve SOTA results   |
diff --git a/examples/llm/glem.py b/examples/llm/glem.py
new file mode 100644
index 000000000000..ec76cef4c010
--- /dev/null
+++ b/examples/llm/glem.py
@@ -0,0 +1,443 @@
+"""This example runs the GLEM model using PyG.
+Original Paper: https://arxiv.org/abs/2210.14709
+"Learning on Large-scale Text-attributed Graphs via Variational Inference".
+Requirements on top of basic PyG:
+`pip install ogb transformers peft tqdm`.
+GLEM is a data-augmentation co-training strategy for LMs and GNNs. Our
+implementation extends the original implementation from LMs to LLMs and opts
+for LoRA from peft.
+
+.. note::
+    To use the additional trick, add your external prediction by assigning
+    `ext_pred_path`; it is combined into the pretraining phase and the node
+    features.
+"""

+import argparse
+import os
+import os.path as osp
+import time
+
+import torch
+from ogb.nodeproppred import Evaluator, PygNodePropPredDataset
+
+from torch_geometric import seed_everything
+from torch_geometric.data import download_google_url
+from torch_geometric.datasets import TAGDataset
+from torch_geometric.loader import DataLoader, NeighborLoader
+from torch_geometric.nn.models import GAT, GCN, GLEM, GraphSAGE
+
+
+def get_n_params(model):
+    pp = 0
+    for p in list(model.parameters()):
+        nn = 1
+        for s in list(p.size()):
+            nn = nn * s
+        pp += nn
+    return pp
+
+
+def main(args):
+    gpu = args.gpu
+    dataset_name = args.dataset
+    root = osp.join('data', 'ogb')
+    hf_model = args.hf_model
+    pl_ratio = args.pl_ratio
+    gnn_lr = args.gnn_lr
+    lm_lr = args.lm_lr
+    em_order = args.em_order
+    gnn_epochs = args.gnn_epochs
+    lm_epochs = args.lm_epochs
+    patience = args.patience
+    verbose = args.verbose
+    out_dir = args.out_dir
+    lm_batch_size = args.lm_batch_size
+    gnn_batch_size = args.gnn_batch_size
+    lm_use_lora = args.lm_use_lora
+    token_on_disk = args.token_on_disk
+    num_em_iters = args.num_em_iters
+    start_time = time.time()
+    train_without_ext_pred = args.train_without_ext_pred
+    ext_pred = None
+    pretrain_augmented = False
+    ext_pseudo_labels = None
+    device = torch.device(
+        f'cuda:{gpu}' if torch.cuda.is_available() else 'cpu')
+    print(f'Running on: {torch.cuda.get_device_name(gpu)}')
+    torch.cuda.empty_cache()
+
+    if not train_without_ext_pred:
+        ext_pred_path = download_google_url(
+            id='15sO2m7BeW7C1Upmdw3Cx1JS__6nxTAzY',
+            folder='data/ogb/ogbn_products/ext_preds',
+            filename='giant_sagn_scr.pt', log=True)
+        ext_pred = torch.load(ext_pred_path, map_location=device)
+        ext_pseudo_labels = ext_pred.argmax(dim=-1)
+        pretrain_augmented = True
+
+    seed_everything(42)
+
+    dataset = PygNodePropPredDataset(f'ogbn-{dataset_name}', root=root)
+    split_idx = dataset.get_idx_split()
+    data = dataset.data
+
+    tag_dataset = TAGDataset(root, dataset, hf_model,
+                             token_on_disk=token_on_disk)
+    text_dataset = tag_dataset.to_text_dataset()
+    print(tag_dataset.num_classes, tag_dataset.raw_file_names)
+
+    num_classes = tag_dataset.num_classes
+    num_features = data.num_features
+    # =========================== LM Data split ===============================
+    split_idx = tag_dataset.get_idx_split()
+
+    # GLEM trains with augmented data; mark the original train data as gold
+    # data.
+    gold_idx = split_idx['train']
+    split_idx['valid']
+    test_idx = split_idx['test']
+
+    # Randomly sample pseudo-label nodes and generate their indices.
+    num_pseudo_labels = int(gold_idx.numel() * pl_ratio)
+    idx_to_select = torch.randperm(test_idx.numel())[:num_pseudo_labels]
+    pseudo_labels_idx = test_idx[idx_to_select]
+    train_idx = torch.cat(
+        (gold_idx, pseudo_labels_idx))  # augmented train_idx
+
+    print(f'train_idx: {train_idx.size(0)}, '
+          f'gold_idx: {gold_idx.size(0)}, '
+          f'pseudo labels ratio: {pl_ratio}, '
+          f'{train_idx.size(0)/gold_idx.size(0) - 1.0}')
+    gold_dataset = torch.utils.data.Subset(dataset=text_dataset,
+                                           indices=gold_idx)
+    train_dataset = torch.utils.data.Subset(dataset=text_dataset,
+                                            indices=train_idx)
+    # ========================== LM Data Loader ===============================
+
+    print('Building language model dataloader...', end='-->')
+
+    # If train_without_ext_pred == True, use this loader for pretraining.
+    text_pretrain_loader = 
DataLoader(gold_dataset, batch_size=lm_batch_size, + drop_last=False, pin_memory=True, + shuffle=True) + # training with augmented data, + text_train_loader = DataLoader(train_dataset, batch_size=lm_batch_size, + drop_last=False, pin_memory=True, + shuffle=True) + text_test_loader = DataLoader(text_dataset, batch_size=lm_batch_size * 4, + drop_last=False, pin_memory=True, + shuffle=False) + print('done') + + # =========================== GNN Data Loader ============================= + initial_memory = torch.cuda.memory_allocated() + data = data.to(device) + if ext_pred is not None: + data.x = torch.cat((data.x, ext_pred), dim=1) + num_features += ext_pred.size(1) + current_memory_1 = torch.cuda.max_memory_allocated() + # 1 GB = 1073741824 Byte + gpu_usage = float(current_memory_1 - initial_memory) / 1073741824 + # Print the maximum memory usage after running the model + print(f'GPU memory usage -- data to gpu: {gpu_usage:.2f} GB') + + print('build GNN dataloader(GraphSAGE NeighborLoader)', end='-->') + + # train on gold data w/o pseudo labels + graph_pretrain_loader = NeighborLoader( + data, + input_nodes=gold_idx, + num_neighbors=[15, 10, 5], + batch_size=gnn_batch_size, + shuffle=True, + num_workers=12, + persistent_workers=True, + ) + + # graph data loader w/ pseudo labels in M-step + graph_train_loader = NeighborLoader( + data, + input_nodes=train_idx, + num_neighbors=[15, 10, 5], + batch_size=gnn_batch_size, + shuffle=True, + num_workers=12, + persistent_workers=True, + ) + + # for gnn inference + subgraph_loader = NeighborLoader( + data, + input_nodes=None, + num_neighbors=[-1], + batch_size=gnn_batch_size * 4, + num_workers=12, + persistent_workers=True, + ) + # =========================== internal function =========================== + + evaluator = Evaluator(name=f'ogbn-{dataset_name}') + + def evaluate(out, split): + y_true = data.y.cpu() + y_pred = out.argmax(dim=-1, keepdim=True) + train_acc, val_acc, test_acc = None, None, None + if 'train' in split: + train_acc = evaluator.eval({ + 'y_true': y_true[split_idx['train']], + 'y_pred': y_pred[split_idx['train']], + })['acc'] + if 'valid' in split: + val_acc = evaluator.eval({ + 'y_true': y_true[split_idx['valid']], + 'y_pred': y_pred[split_idx['valid']], + })['acc'] + if 'test' in split: + test_acc = evaluator.eval({ + 'y_true': y_true[split_idx['test']], + 'y_pred': y_pred[split_idx['test']], + })['acc'] + + return train_acc, val_acc, test_acc + + # =========================== Build GNN Model ============================= + gnn = None + if args.gnn_model == 'SAGE': + gnn = GraphSAGE( + in_channels=num_features, + hidden_channels=args.gnn_hidden_channels, + num_layers=args.gnn_num_layers, + out_channels=dataset.num_classes, + ) + elif args.gnn_model == 'GAT': + gnn = GAT(in_channels=num_features, + hidden_channels=args.gnn_hidden_channels, + num_layers=args.gnn_num_layers, + out_channels=dataset.num_classes, heads=args.gat_heads) + else: + gnn = GCN( + in_channels=num_features, + hidden_channels=args.gnn_hidden_channels, + num_layers=args.gnn_num_layers, + out_channels=dataset.num_classes, + ) + + print("# GNN Params:", get_n_params(gnn)) + # =========================== Build LM Model ============================== + + model = GLEM(lm_to_use=hf_model, gnn_to_use=gnn, out_channels=num_classes, + lm_use_lora=lm_use_lora, device=device) + lm = model.lm + print("# LM Params:", get_n_params(lm)) + gnn_opt = torch.optim.Adam(gnn.parameters(), lr=gnn_lr) + lm_opt = torch.optim.Adam(lm.parameters(), lr=lm_lr) + + def load_model(em_phase): 
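+        # Move the model for the given EM phase back onto the GPU and
+        # create a fresh optimizer for its parameters.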
+        print(f'Move {em_phase} model from CPU memory')
+        if em_phase == 'lm':
+            model.lm = model.lm.to(device, non_blocking=True)
+            optimizer = torch.optim.Adam(model.lm.parameters(), lr=lm_lr)
+        if em_phase == 'gnn':
+            model.gnn = model.gnn.to(device, non_blocking=True)
+            optimizer = torch.optim.Adam(model.gnn.parameters(), lr=gnn_lr)
+        return optimizer
+
+    # ================================= Run GLEM ==============================
+    preds_filename = 'lm_pretrain'
+    preds_dir = f'{out_dir}preds/{dataset_name}/'
+    gnn_test_acc = 0.0
+    lm_test_acc = 0.0
+    # =============================== GLEM pretraining ========================
+    pretrain_phase = 'lm'
+    if em_order == 'lm':
+        pretrain_phase = 'gnn'
+    pretrain_start_time = time.time()
+    # pretraining
+    pretrain_loader = graph_pretrain_loader
+    test_loader = subgraph_loader
+    pretrain_num_epochs = gnn_epochs
+    pretrain_opt = gnn_opt
+    if pretrain_phase == 'gnn':
+        model.gnn = model.gnn.to(device)
+        print('pretraining gnn to generate pseudo labels')
+        if not train_without_ext_pred:
+            pretrain_loader = graph_train_loader
+        preds_filename = 'gnn_pretrain'
+    elif pretrain_phase == 'lm':
+        model.lm = model.lm.to(device)
+        print('pretraining lm to generate pseudo labels')
+        pretrain_num_epochs = lm_epochs
+        pretrain_loader = text_pretrain_loader
+        test_loader = text_test_loader
+        pretrain_opt = lm_opt
+        if not train_without_ext_pred:
+            pretrain_loader = text_train_loader
+        preds_filename = 'lm_pretrain'
+
+    early_stopping = 0
+    best_val_acc = 0.0
+    for epoch in range(1, pretrain_num_epochs + 1):
+        acc, loss = model.train(pretrain_phase, pretrain_loader, pretrain_opt,
+                                ext_pseudo_labels, epoch, pretrain_augmented,
+                                verbose)
+        if epoch >= 5 or epoch == pretrain_num_epochs:
+            pretrain_preds = model.inference(pretrain_phase, test_loader,
+                                             verbose=verbose)
+            train_acc, val_acc, _ = evaluate(pretrain_preds,
+                                             ['train', 'valid'])
+
+            print(f'Train: {train_acc:.4f}, Val: {val_acc:.4f}')
+
+            if val_acc <= best_val_acc:
+                early_stopping += 1
+                if early_stopping > patience:
+                    print(f'Pretraining early stopped at epoch {epoch}')
+                    break
+            else:
+                best_val_acc = val_acc
+    preds = model.inference(pretrain_phase, test_loader, verbose=verbose)
+    train_acc, val_acc, test_acc = evaluate(preds, ['train', 'valid', 'test'])
+    if pretrain_phase == 'gnn':
+        gnn_test_acc = max(gnn_test_acc, test_acc)
+        model.gnn = model.gnn.to('cpu', non_blocking=True)
+    else:
+        lm_test_acc = max(lm_test_acc, test_acc)
+        model.lm = model.lm.to('cpu', non_blocking=True)
+    torch.cuda.empty_cache()
+
+    pretrain_phase_time = time.time() - pretrain_start_time
+    print(f'Pretrain {pretrain_phase} time: {pretrain_phase_time:.2f}s')
+    os.makedirs(osp.dirname(preds_dir), exist_ok=True)
+    torch.save(preds, osp.join(preds_dir, f'{preds_filename}.pt'))
+    print(
+        f'Saved predictions to {osp.join(preds_dir, f"{preds_filename}.pt")}')
+    train_acc, val_acc, test_acc = evaluate(preds, ['train', 'valid', 'test'])
+    print(f'Pretraining acc: {train_acc:.4f}, Val: {val_acc:.4f}, '
+          f'Test: {test_acc:.4f}')
+
+    # EM iterations
+
+    em_phase = em_order
+    """
+    We run the E-step (LM training) and the M-step (GNN training) alternately
+    in each EM iteration, so the total number of iterations is
+    num_em_iters * 2, and we switch the em_phase at the end of each iteration
+    in the following loop.
+    """
+    gnn_val_acc = lm_val_acc = 0.0
+    for em_it in range(1, num_em_iters * 2 + 1):
+        pseudo_labels = preds.argmax(dim=-1)
+        best_val_acc = 0.0
+        print(f'EM iteration: {em_it}, EM phase: {em_phase}')
+        optimizer = load_model(em_phase)
+        num_epochs = lm_epochs
+        train_loader = text_train_loader
+        test_loader = text_test_loader
+        early_stopping = 0
+        if em_phase == 'gnn':
+            train_loader = graph_train_loader
+            num_epochs = gnn_epochs
+            test_loader = subgraph_loader
+        for epoch in range(1, num_epochs + 1):
+            acc, loss = model.train(em_phase, train_loader, optimizer,
+                                    pseudo_labels, epoch, True, verbose)
+            if epoch >= 5 or epoch == num_epochs:
+                cur_preds = model.inference(em_phase, test_loader,
+                                            verbose=verbose)
+                train_acc, val_acc, _ = evaluate(cur_preds, ['train', 'valid'])
+
+                print(f'Train: {train_acc:.4f}, Val: {val_acc:.4f}')
+
+                if val_acc <= best_val_acc:
+                    early_stopping += 1
+                    if early_stopping > patience:
+                        print(f'Early stopped at epoch {epoch}, '
+                              f'best acc: {best_val_acc}')
+                        break
+                else:
+                    best_val_acc = val_acc
+
+        preds = model.inference(em_phase, test_loader, verbose=verbose)
+        if em_phase == 'gnn':
+            gnn_val_acc = max(gnn_val_acc, best_val_acc)
+            model.gnn = model.gnn.to('cpu', non_blocking=True)
+            em_phase = 'lm'
+        else:
+            lm_val_acc = max(lm_val_acc, best_val_acc)
+            model.lm = model.lm.to('cpu', non_blocking=True)
+            em_phase = 'gnn'
+        torch.cuda.empty_cache()
+    print(f'Best GNN validation acc: {gnn_val_acc}, '
+          f'LM validation acc: {lm_val_acc}')
+    print('============================')
+    if gnn_val_acc > lm_val_acc:
+        em_phase = 'gnn'
+        model.gnn = model.gnn.to(device, non_blocking=True)
+    else:
+        em_phase = 'lm'
+        model.lm = model.lm.to(device, non_blocking=True)
+    test_preds = model.inference(em_phase, test_loader, verbose=verbose)
+    train_acc, val_acc, test_acc = evaluate(test_preds,
+                                            ['train', 'valid', 'test'])
+    final_test_acc = max(gnn_test_acc, max(lm_test_acc, test_acc))
+    print(f'Best test acc: {final_test_acc}, model: {em_phase}')
+    end_time = time.time()
+    running_time = (end_time - start_time) / 3600
+    print(f'Total running time: {running_time:.2f} hours')
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='GLEM Example:')
+    parser.add_argument('--gpu', type=int, default=0)
+    parser.add_argument('--num_runs', type=int, default=10,
+                        help='number of runs')
+    parser.add_argument('--num_em_iters', type=int, default=1,
+                        help='number of EM iterations')
+    parser.add_argument("--dataset", type=str, default='products',
+                        help='arxiv or products')
+    parser.add_argument("--pl_ratio", type=float, default=0.5,
+                        help="pseudo labels ratio")
+    parser.add_argument('--hf_model', type=str, default='prajjwal1/bert-tiny',
+                        help='huggingface model repo id')
+    parser.add_argument(
+        '--gnn_model', type=str, default='SAGE',
+        help='gnn model for node classification, '
+        'options: SAGE, GAT, GCN')
+    parser.add_argument('--gnn_hidden_channels', type=int, default=256)
+    parser.add_argument('--gnn_num_layers', type=int, default=3)
+    parser.add_argument('--gat_heads', type=int, default=4,
+                        help='number of attention heads for GAT')
+    parser.add_argument('--lm_batch_size', type=int, default=256)
+    parser.add_argument('--gnn_batch_size', type=int, default=1024)
+    parser.add_argument(
+        '--external_pred_path', type=str, default=None,
+        help="Path to another model's output logits, used during the "
+        "pretraining phase or simply concatenated with "
+        "node features as augmented data for the gnn")
+    parser.add_argument('--alpha', type=float, default=0.5,
+                        help='pseudo label weight in E-step')
+    parser.add_argument('--beta', type=float, default=0.5,
+                        help='pseudo label weight in M-step')
+    parser.add_argument('--lm_epochs', type=int, default=10)
+    parser.add_argument('--gnn_epochs', type=int, default=50)
+    parser.add_argument('--gnn_lr', type=float, default=0.002)
+    parser.add_argument('--lm_lr', type=float, default=0.001)
+    parser.add_argument('--patience', type=int, default=3,
+                        help='patience for early stopping')
+    parser.add_argument('--verbose', action='store_true',
+                        help='show progress bar during training or not')
+    parser.add_argument('--em_order', type=str, default='lm',
+                        help='whether to train the LM or the GNN first')
+    parser.add_argument('--lm_use_lora', action='store_true',
+                        help='use LoRA to fine-tune the model or not')
+    parser.add_argument(
+        '--token_on_disk', action='store_true',
+        help='save tokens on disk and load them from disk '
+        'to avoid duplicated tokenizing')
+    parser.add_argument('--out_dir', type=str, default='output/',
+                        help='output directory')
+    parser.add_argument(
+        '--train_without_ext_pred', action='store_true',
+        help='train GLEM without using additional pseudo labels '
+        'for augmenting data (only available for ogbn-products)')
+    args = parser.parse_args()
+    print(args)
+    main(args)
diff --git a/torch_geometric/datasets/__init__.py b/torch_geometric/datasets/__init__.py
index 96d51032d818..0b6569d3f92b 100644
--- a/torch_geometric/datasets/__init__.py
+++ b/torch_geometric/datasets/__init__.py
@@ -77,6 +77,7 @@
 from .brca_tgca import BrcaTcga
 from .neurograph import NeuroGraphDataset
 from .web_qsp_dataset import WebQSPDataset
+from .tag_dataset import TAGDataset
 from .dbp15k import DBP15K
 from .aminer import AMiner
@@ -190,6 +191,7 @@
     'BrcaTcga',
     'NeuroGraphDataset',
     'WebQSPDataset',
+    'TAGDataset',
 ]
 
 hetero_datasets = [
diff --git a/torch_geometric/datasets/tag_dataset.py b/torch_geometric/datasets/tag_dataset.py
new file mode 100644
index 000000000000..f25992ced989
--- /dev/null
+++ b/torch_geometric/datasets/tag_dataset.py
@@ -0,0 +1,350 @@
+import os
+import os.path as osp
+from collections.abc import Sequence
+from typing import Dict, List, Optional, Union
+
+import numpy as np
+import torch
+from torch import Tensor
+from tqdm import tqdm
+
+from torch_geometric.data import InMemoryDataset, download_google_url
+from torch_geometric.data.data import BaseData
+
+try:
+    from pandas import DataFrame, read_csv
+    WITH_PANDAS = True
+except ImportError:
+    WITH_PANDAS = False
+
+IndexType = Union[slice, Tensor, np.ndarray, Sequence]
+
+
+class TAGDataset(InMemoryDataset):
+    r"""The Text-Attributed Graph datasets from the
+    `"Learning on Large-scale Text-attributed Graphs via Variational
+    Inference" <https://arxiv.org/abs/2210.14709>`_ paper.
+    This dataset aims to transform `ogbn-products` and `ogbn-arxiv` into a
+    Text-Attributed Graph in which each node is associated with raw text, so
+    that the dataset can be adapted to a DataLoader (for LM training) and a
+    NeighborLoader (for GNN training). In addition, this class can be used as
+    a wrapper class that converts an InMemoryDataset with a tokenizer and raw
+    text into a Text-Attributed Graph.
+
+    Args:
+        root (str): Root directory where the dataset should be saved.
+        dataset (InMemoryDataset): The dataset to be wrapped
+            (:obj:`"ogbn-products"`, :obj:`"ogbn-arxiv"`).
+        tokenizer_name (str): The tokenizer name for the language model.
+            Be sure to use the same tokenizer name as the `model id` of the
+            model repo on huggingface.co.
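+            (e.g., :obj:`"prajjwal1/bert-tiny"`, the default LM used in the
+            GLEM example)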
+        text (List[str]): A list of raw text associated with the nodes; the
+            order of the list should align with the node list.
+        split_idx (Optional[Dict[str, torch.Tensor]]): Optional dictionary
+            for saving the split index; it is required if your dataset does
+            not have a get_idx_split function.
+        tokenize_batch_size (int): Batch size for tokenizing the text; the
+            tokenizing process will run on the CPU. (default: 256)
+        token_on_disk (bool): Whether to save the tokens as a .pt file on
+            disk. (default: False)
+        text_on_disk (bool): Whether to save the given text (list of str) as
+            a dataframe on disk. (default: False)
+        force_reload (bool): (default: False)
+
+    .. note::
+        See `examples/llm/glem.py` for example usage.
+    """
+    raw_text_id = {
+        'ogbn-arxiv': '1g3OOVhRyiyKv13LY6gbp8GLITocOUr_3',
+        'ogbn-products': '1I-S176-W4Bm1iPDjQv3hYwQBtxE0v8mt'
+    }
+
+    def __init__(self, root: str, dataset: InMemoryDataset,
+                 tokenizer_name: str, text: Optional[List[str]] = None,
+                 split_idx: Optional[Dict[str, Tensor]] = None,
+                 tokenize_batch_size: int = 256, token_on_disk: bool = False,
+                 text_on_disk: bool = False,
+                 force_reload: bool = False) -> None:
+        # List the variables you want to pass in before running download &
+        # process.
+        self.name = dataset.name
+        self.text = text
+        self.tokenizer_name = tokenizer_name
+        from transformers import AutoTokenizer
+        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
+        if self.tokenizer.pad_token_id is None:
+            self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
+        if self.tokenizer.pad_token is None:
+            self.tokenizer.pad_token = self.tokenizer.eos_token
+
+        self.dir_name = '_'.join(dataset.name.split('-'))
+        self.root = osp.join(root, self.dir_name)
+        missing_str_list = []
+        if not WITH_PANDAS:
+            missing_str_list.append('pandas')
+        if len(missing_str_list) > 0:
+            missing_str = ' '.join(missing_str_list)
+            error_out = f"`pip install {missing_str}` to use this dataset."
+            raise ImportError(error_out)
+        if hasattr(dataset, 'get_idx_split'):
+            self.split_idx = dataset.get_idx_split()
+        elif split_idx is not None:
+            self.split_idx = split_idx
+        else:
+            raise ValueError("TAGDataset needs a split index to generate "
+                             "the is_gold mask. Please pass the split "
+                             "indices as a dictionary with 'train', 'valid' "
+                             "and 'test' index tensors to 'split_idx'.")
+        if text is not None and text_on_disk:
+            self.save_node_text(text)
+        self.text_on_disk = text_on_disk
+        # __init__ will call download and process
+        super().__init__(self.root, transform=None, pre_transform=None,
+                         pre_filter=None, force_reload=force_reload)
+        # after processing and download
+        # Dataset has to have BaseData as _data
+        assert dataset._data is not None
+        self._data = dataset._data  # reassign reference
+        assert self._data is not None
+        assert dataset._data.y is not None
+        assert isinstance(self._data, BaseData)
+        assert self._data.num_nodes is not None
+        assert isinstance(dataset._data.num_nodes, int)
+        assert isinstance(self._data.num_nodes, int)
+        self._n_id = torch.arange(self._data.num_nodes)
+        is_gold_tensor = self.load_gold_mask()
+        self._is_gold = is_gold_tensor.squeeze()
+        self._data['is_gold'] = is_gold_tensor
+        if self.text is not None and len(self.text) != self._data.num_nodes:
+            raise ValueError("The number of text sequences in 'text' should "
+                             "be equal to the number of nodes!")
+        self.token_on_disk = token_on_disk
+        self.tokenize_batch_size = tokenize_batch_size
+        self._token = self.tokenize_graph(self.tokenize_batch_size)
+        self.__num_classes__ = dataset.num_classes
+
+    @property
+    def num_classes(self) -> int:
+        return self.__num_classes__
+
+    @property
+    def raw_file_names(self) -> List[str]:
+        file_names = []
+        for root, _, files in os.walk(osp.join(self.root, 'raw')):
+            for file in files:
+                file_names.append(file)
+        return file_names
+
+    @property
+    def processed_file_names(self) -> List[str]:
+        return [
+            'geometric_data_processed.pt', 'pre_filter.pt',
+            'pre_transformed.pt'
+        ]
+
+    @property
+    def token(self) -> Dict[str, Tensor]:
+        if self._token is None:  # lazy load
+            self._token = self.tokenize_graph()
+        return self._token
+
+    # load is_gold after init
+    @property
+    def is_gold(self) -> Tensor:
+        if self._is_gold is None:
+            print('Lazy loading is_gold...')
+            self._is_gold = self.load_gold_mask()
+        return self._is_gold
+
+    def get_n_id(self, node_idx: IndexType) -> Tensor:
+        if self._n_id is None:
+            assert self._data is not None
+            assert self._data.num_nodes is not None
+            assert isinstance(self._data.num_nodes, int)
+            self._n_id = torch.arange(self._data.num_nodes)
+        return self._n_id[node_idx]
+
+    def load_gold_mask(self) -> Tensor:
+        r"""Use the original train split as the gold split, generating the
+        is_gold mask for picking ground-truth labels and pseudo labels.
+        """
+        train_split_idx = self.get_idx_split()['train']
+        assert self._data is not None
+        assert self._data.num_nodes is not None
+        assert isinstance(self._data.num_nodes, int)
+        is_gold_tensor = torch.zeros(self._data.num_nodes,
+                                     dtype=torch.bool).view(-1, 1)
+        is_gold_tensor[train_split_idx] = True
+        return is_gold_tensor
+
+    def get_gold(self, node_idx: IndexType) -> Tensor:
+        r"""Get the gold mask for the given node indices.
+
+        Args:
+            node_idx (torch.Tensor): A tensor containing node indices.
+        """
+        if self._is_gold is None:
+            self._is_gold = self.is_gold
+        return self._is_gold[node_idx]
+
+    def get_idx_split(self) -> Dict[str, Tensor]:
+        return self.split_idx
+
+    def download(self) -> None:
+        print('Downloading raw text...')
+        raw_text_path = download_google_url(id=self.raw_text_id[self.name],
+                                            folder=f'{self.root}/raw',
+                                            filename='node-text.csv.gz',
+                                            log=True)
+        text_df = read_csv(raw_text_path)
+        self.text = list(text_df['text'])
+
+    def process(self) -> None:
+        if osp.exists(osp.join(self.root, 'raw', 'node-text.csv.gz')):
+            text_df = read_csv(osp.join(self.root, 'raw', 'node-text.csv.gz'))
+            self.text = list(text_df['text'])
+        elif self.name in self.raw_text_id:
+            self.download()
+        else:
+            print('The dataset is neither ogbn-products nor ogbn-arxiv; '
+                  'please pass in your list of raw text strings to `text`')
+        if self.text is None:
+            raise ValueError("TAGDataset only provides raw text for "
+                             "ogbn-products and ogbn-arxiv by default. "
+                             "The raw text of each node is not specified. "
+                             "Please pass in 'text' when converting your "
+                             "dataset to a Text-Attributed Graph dataset.")
+
+    def save_node_text(self, text: List[str]) -> None:
+        node_text_path = osp.join(self.root, 'raw', 'node-text.csv.gz')
+        if osp.exists(node_text_path):
+            print(f'The raw text already exists at {node_text_path}')
+        else:
+            print(f'Saving raw text file at {node_text_path}')
+            os.makedirs(f'{self.root}/raw', exist_ok=True)
+            text_df = DataFrame(text, columns=['text'])
+            text_df.to_csv(osp.join(node_text_path), compression='gzip',
+                           index=False)
+
+    def tokenize_graph(self, batch_size: int = 256) -> Dict[str, Tensor]:
+        r"""Tokenize the text associated with each node, running on the CPU.
+
+        Args:
+            batch_size (Optional[int]): Batch size of the list of text for
+                generating embeddings.
+
+        Returns:
+            Dict[str, torch.Tensor]: The tokenized graph.
+        """
+        data_len = 0
+        if self.text is not None:
+            data_len = len(self.text)
+        else:
+            raise ValueError("TAGDataset needs text for tokenization")
+        token_keys = ['input_ids', 'token_type_ids', 'attention_mask']
+        path = os.path.join(self.processed_dir, 'token', self.tokenizer_name)
+        # Check if the .pt files already exist
+        token_files_exist = any(
+            os.path.exists(os.path.join(path, f'{k}.pt')) for k in token_keys)
+
+        if token_files_exist and self.token_on_disk:
+            print('Found tokenized file, loading may take several minutes...')
+            all_encoded_token = {
+                k: torch.load(os.path.join(path, f'{k}.pt'), weights_only=True)
+                for k in token_keys
+                if os.path.exists(os.path.join(path, f'{k}.pt'))
+            }
+            return all_encoded_token
+
+        all_encoded_token = {k: [] for k in token_keys}
+        pbar = tqdm(total=data_len)
+
+        pbar.set_description('Tokenizing Text Attributed Graph')
+        for i in range(0, data_len, batch_size):
+            end_index = min(data_len, i + batch_size)
+            token = self.tokenizer(self.text[i:end_index],
+                                   padding='max_length', truncation=True,
+                                   max_length=512, return_tensors="pt")
+            for k in token.keys():
+                all_encoded_token[k].append(token[k])
+            pbar.update(end_index - i)
+        pbar.close()
+
+        all_encoded_token = {
+            k: torch.cat(v)
+            for k, v in all_encoded_token.items() if len(v) > 0
+        }
+        if self.token_on_disk:
+            os.makedirs(path, exist_ok=True)
+            print('Saving tokens on disk')
+            for k, tensor in all_encoded_token.items():
+                torch.save(tensor, os.path.join(path, f'{k}.pt'))
+                print('Token saved:', os.path.join(path, f'{k}.pt'))
+        os.environ["TOKENIZERS_PARALLELISM"] = 'true'  # suppressing warning
+        return all_encoded_token
+
+    def __repr__(self) -> str:
+        return f'{self.__class__.__name__}()'
+
+    class TextDataset(torch.utils.data.Dataset):
+        r"""This nested dataset provides the textual data for each node in
+        the graph. Use the :meth:`to_text_dataset` factory method of
+        :class:`TAGDataset` to create it.
+
+        Args:
+            tag_dataset (TAGDataset): The parent dataset.
+        """
+        def __init__(self, tag_dataset: 'TAGDataset') -> None:
+            self.tag_dataset = tag_dataset
+            self.token = tag_dataset.token
+            assert tag_dataset._data is not None
+            self._data = tag_dataset._data
+
+            assert tag_dataset._data.y is not None
+            self.labels = tag_dataset._data.y
+
+        def get_token(self, node_idx: IndexType) -> Dict[str, Tensor]:
+            r"""This function will be called in __getitem__().
+
+            Args:
+                node_idx (IndexType): The selected node indices in each batch.
+
+            Returns:
+                items (Dict[str, Tensor]): Input for the LM.
+            """
+            items = {k: v[node_idx] for k, v in self.token.items()}
+            return items
+
+        # for LM training
+        def __getitem__(
+                self, node_id: IndexType
+        ) -> Dict[str, Union[Tensor, Dict[str, Tensor]]]:
+            r"""This function overrides the function in
+            torch.utils.data.Dataset and is called when you iterate over
+            batches in the dataloader; make sure all of the following
+            key-value pairs are present in the returned dict.
+
+            Args:
+                node_id (List[int]): List of node indices for selecting
+                    tokens, labels, etc. when iterating over the data loader
+                    for the LM.
+
+            Returns:
+                items (dict): Input key-value pairs for language model
+                    training and inference.
+            """
+            item: Dict[str, Union[Tensor, Dict[str, Tensor]]] = {}
+            item['input'] = self.get_token(node_id)
+            item['labels'] = self.labels[node_id]
+            item['is_gold'] = self.tag_dataset.get_gold(node_id)
+            item['n_id'] = self.tag_dataset.get_n_id(node_id)
+            return item
+
+        def __len__(self) -> int:
+            assert self._data.num_nodes is not None
+            return self._data.num_nodes
+
+        def get(self, idx: int) -> BaseData:
+            return self._data
+
+        def __repr__(self) -> str:
+            return f'{self.__class__.__name__}()'
+
+    def to_text_dataset(self) -> TextDataset:
+        r"""Factory method that builds a text dataset from this
+        Text-Attributed Graph dataset; each data point is a node's associated
+        text tokens.
+        """
+        return TAGDataset.TextDataset(self)
diff --git a/torch_geometric/nn/models/__init__.py b/torch_geometric/nn/models/__init__.py
index 7cfadf0143b2..5860db311ac3 100644
--- a/torch_geometric/nn/models/__init__.py
+++ b/torch_geometric/nn/models/__init__.py
@@ -29,7 +29,7 @@
 from .neural_fingerprint import NeuralFingerprint
 from .visnet import ViSNet
 from .g_retriever import GRetriever
-
+from .glem import GLEM
 # Deprecated:
 from torch_geometric.explain.algorithm.captum import (to_captum_input,
                                                       captum_output_to_dicts)
@@ -77,4 +77,5 @@
     'NeuralFingerprint',
     'ViSNet',
     'GRetriever',
+    'GLEM',
 ]
diff --git a/torch_geometric/nn/models/glem.py b/torch_geometric/nn/models/glem.py
new file mode 100644
index 000000000000..afc8b09d77c7
--- /dev/null
+++ b/torch_geometric/nn/models/glem.py
@@ -0,0 +1,384 @@
+from typing import List, Optional, Union
+
+import torch
+import torch.nn as nn
+from tqdm import tqdm
+
+from torch_geometric.loader import DataLoader, NeighborLoader
+from torch_geometric.nn.models import GraphSAGE, basic_gnn
+
+
+class GLEM(torch.nn.Module):
+    r"""This GNN+LM co-training model is based on GLEM from the `"Learning on
+    Large-scale Text-attributed Graphs via Variational Inference"
+    <https://arxiv.org/abs/2210.14709>`_ paper.
+
+    Args:
+        lm_to_use (str): A TextEncoder from the huggingface model repo with a
+            classifier. (default: TinyBERT)
+        gnn_to_use (torch_geometric.nn.models): (default: GraphSAGE)
+        out_channels (int): Output channels for the LM and GNN; should be the
+            same.
+        num_gnn_heads (Optional[int]): Number of heads for attention, if
+            needed.
+        num_gnn_layers (int): Number of GNN layers.
+        gnn_loss: Loss function for the GNN. (default: CrossEntropyLoss)
+        lm_loss: Loss function for the language model.
+            (default: CrossEntropyLoss)
+        alpha (float): Pseudo-label weight of the E-step (LM optimization).
+            (default: 0.5)
+        beta (float): Pseudo-label weight of the M-step (GNN optimization).
+            (default: 0.5)
+        lm_dtype (torch.dtype): The data type the LM is loaded into memory
+            with. (default: torch.bfloat16)
+        lm_use_lora (bool): Whether the LM uses LoRA peft for fine-tuning.
+            (default: True)
+        lora_target_modules: The names of the target modules to apply the
+            LoRA adapter to, e.g. ['q_proj', 'v_proj'] for an LLM.
+            (default: None)
+
+    .. note::
+        See `examples/llm/glem.py` for example usage.
+    """
+    def __init__(
+        self,
+        lm_to_use: str = 'prajjwal1/bert-tiny',
+        gnn_to_use: basic_gnn = GraphSAGE,
+        out_channels: int = 47,
+        gnn_loss=nn.CrossEntropyLoss(reduction='mean'),
+        lm_loss=nn.CrossEntropyLoss(reduction='mean'),
+        alpha: float = 0.5,
+        beta: float = 0.5,
+        lm_dtype: torch.dtype = torch.bfloat16,
+        lm_use_lora: bool = True,
+        lora_target_modules: Optional[Union[List[str], str]] = None,
+        device: Union[str, torch.device] = torch.device('cpu'),
+    ):
+        super().__init__()
+        self.device = device
+        self.lm_loss = lm_loss
+        self.gnn = gnn_to_use
+        self.gnn_loss = gnn_loss
+        self.alpha = alpha
+        self.beta = beta
+        from transformers import AutoModelForSequenceClassification
+        self.lm = AutoModelForSequenceClassification.from_pretrained(
+            lm_to_use, num_labels=out_channels, torch_dtype=lm_dtype,
+            offload_folder="offload", trust_remote_code=True)
+        if lm_use_lora:
+            from peft import (
+                LoraConfig,
+                TaskType,
+                get_peft_model,
+                prepare_model_for_kbit_training,
+            )
+            print("Training LM with LoRA!")
+            self.lm = prepare_model_for_kbit_training(self.lm)
+            config = LoraConfig(task_type=TaskType.SEQ_CLS, r=16,
+                                lora_alpha=16, lora_dropout=0.05, bias="none",
+                                target_modules=lora_target_modules)
+            self.lm = get_peft_model(self.lm, config)
+            self.lm.print_trainable_parameters()
+        self.lm.config.pad_token_id = self.lm.config.eos_token_id
+        self.lm_device = self.lm.device
+
+        if self.lm.num_labels != self.gnn.out_channels:
+            raise ValueError('The output channels of the language model '
+                             'and the gnn should be the same')
+
+    def pre_train_gnn(self, train_loader: NeighborLoader,
+                      optimizer: torch.optim.Optimizer, num_epochs: int,
+                      patience: int, ext_pseudo_labels: torch.Tensor = None,
+                      is_augmented: bool = False, verbose: bool = True):
+        # Pretrain the GNN; an optional step if you do not have pseudo
+        # labels.
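+        # `early_stopping` counts epochs whose training accuracy falls below
+        # the best seen so far; training stops once it exceeds `patience`.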
+        best_acc = 0
+        early_stopping = 0
+        # training only based on gold data
+        for epoch in range(0, num_epochs):
+            acc, loss = self.train_gnn(train_loader, optimizer, epoch,
+                                       ext_pseudo_labels, is_augmented,
+                                       verbose)
+            if acc < best_acc:
+                early_stopping += 1
+                if early_stopping > patience:
+                    print(f'Early stopped at epoch {epoch}, '
+                          f'best acc: {best_acc}')
+                    break
+            best_acc = max(best_acc, acc)
+
+    def pre_train_lm(self, train_loader: DataLoader,
+                     optimizer: torch.optim.Optimizer, num_epochs: int,
+                     patience: int, ext_pseudo_labels: torch.Tensor = None,
+                     is_augmented: bool = False, verbose: bool = True):
+        # Pretrain the language model.
+        best_acc = 0
+        early_stopping = 0
+        for epoch in range(1, num_epochs + 1):
+            acc, loss = self.train_lm(train_loader, optimizer, epoch,
+                                      ext_pseudo_labels, is_augmented,
+                                      verbose)
+            if acc < best_acc:
+                early_stopping += 1
+                if early_stopping > patience:
+                    print(f'Early stopped at epoch {epoch}, '
+                          f'best acc: {best_acc}')
+                    break
+            best_acc = max(best_acc, acc)
+
+    def train(self, em_phase: str,
+              train_loader: Union[DataLoader, NeighborLoader],
+              optimizer: torch.optim.Optimizer, pseudo_labels: torch.Tensor,
+              epoch: int, is_augmented: bool = False, verbose: bool = False):
+        r"""GLEM training step (EM steps).
+
+        Args:
+            em_phase (str): 'gnn' or 'lm', choosing which phase you are
+                training on.
+            train_loader (Union[DataLoader, NeighborLoader]): Use DataLoader
+                for LM training (includes tokenized data, labels and the
+                is_gold mask); use NeighborLoader for GNN training (includes
+                x and edge_index).
+            optimizer (torch.optim.Optimizer): Optimizer for training.
+            pseudo_labels (torch.Tensor): The predicted labels used as pseudo
+                labels.
+            epoch (int): Current epoch.
+            is_augmented (bool): Whether to use pseudo_labels or not.
+            verbose (bool): Whether to print a training progress bar.
+
+        Returns:
+            acc (float): Training accuracy.
+            loss (float): Loss value.
+        """
+        pseudo_labels = pseudo_labels.to(self.device)
+        if em_phase == 'gnn':
+            acc, loss = self.train_gnn(train_loader, optimizer, epoch,
+                                       pseudo_labels, is_augmented, verbose)
+        if em_phase == 'lm':
+            acc, loss = self.train_lm(train_loader, optimizer, epoch,
+                                      pseudo_labels, is_augmented, verbose)
+        return acc, loss
+
+    def train_lm(self, train_loader: DataLoader,
+                 optimizer: torch.optim.Optimizer, epoch: int,
+                 pseudo_labels: torch.Tensor = None,
+                 is_augmented: bool = False, verbose: bool = True):
+        r"""Train the language model in every epoch.
+ + Args: + train_loader (loader.dataloader.DataLoader): text token dataloader + optimizer (torch.optim.Optimizer): model optimizer + epoch (int): current train epoch + pseudo_labels (torch.Tensor): 1-D tensor, predictions from gnn + is_augmented (bool): train with pseudo labels or not + verbose (bool): print training progress bar or not + + Returns: + approx_acc (torch.tensor): training accuracy + loss (torch.float): loss value + + """ + all_out = [] + total_loss = total_correct = 0 + num_nodes = train_loader.dataset.indices.size(0) + self.lm.train() + if verbose: + pbar = tqdm(total=num_nodes) + pbar.set_description(f'Epoch {epoch:02d}') + for batch in train_loader: + inputs = {k: v.to(self.device) for k, v in batch['input'].items()} + out = self.lm(**inputs).logits + labels = batch['labels'].to(self.device).squeeze() + # training with pseudo labels or not + if is_augmented: + pl_batch = pseudo_labels[batch['n_id']].to(self.device) + else: + pl_batch = None + loss = self.loss(out, labels, self.lm_loss, + batch['is_gold'].to(self.device), pl_batch, + self.alpha, is_augmented) + loss.backward() + optimizer.step() + optimizer.zero_grad() + all_out.append(out) + total_correct += int(out.argmax(dim=-1).eq(labels).sum()) + total_loss += float(loss) + if verbose: + pbar.update(batch['n_id'].size(0)) + + all_out = torch.cat(all_out, dim=0) + approx_acc = total_correct / num_nodes + loss = total_loss / len(train_loader) + if verbose: + pbar.close() + print(f'Epoch {epoch:02d} Loss: {loss:.4f} ' + f'Approx. Train: {approx_acc:.4f}') + return approx_acc, loss + + def train_gnn(self, train_loader: NeighborLoader, + optimizer: torch.optim.Optimizer, epoch: int, + pseudo_labels: torch.Tensor = None, + is_augmented: bool = False, verbose: bool = True): + r"""GNN training step in every epoch. + + Args: + train_loader (loader.NeighborLoader): gnn Neighbor node loader + optimizer (torch.optim.Optimizer): model optimizer + epoch (int): current train epoch + pseudo_labels(torch.tensor): 1-D tensor, predictions from lm + is_augmented(bool): use pseudo labeled node or not + verbose (bool): print training progress or not + + Returns: + approx_acc (torch.tensor): training accuracy + loss (torch.float): loss value + """ + self.gnn.train() + num_nodes = train_loader.input_nodes.size(0) + if verbose: + pbar = tqdm(total=num_nodes) + pbar.set_description(f'Epoch {epoch:02d}') + total_loss = total_correct = 0 + all_out = [] + for batch in train_loader: + batch = batch.to(self.device) + out = self.gnn(batch.x, batch.edge_index)[:batch.batch_size] + all_out.append(out) + labels = batch.y[:batch.batch_size].squeeze() + is_gold_batch = batch.is_gold[:batch.batch_size].squeeze() + # training with pseudo labels or not + if is_augmented and pseudo_labels is not None: + pl_batch = pseudo_labels[batch.n_id[:batch.batch_size]] + else: + pl_batch = None + loss = self.loss(out, labels, self.gnn_loss, is_gold_batch, + pl_batch, self.beta, is_augmented) + loss.backward() + optimizer.step() + optimizer.zero_grad() + total_loss += float(loss) + total_correct += int(out.argmax(dim=-1).eq(labels).sum()) + if verbose: + pbar.update(batch.batch_size) + + all_out = torch.cat(all_out, dim=0) + loss = total_loss / len(train_loader) + approx_acc = total_correct / num_nodes + if verbose: + pbar.close() + print(f'Epoch: {epoch:02d} Loss: {loss:.4f} ' + f'Approx. 
Train: {approx_acc:.4f}')
+        return approx_acc, loss
+
+    @torch.no_grad()
+    def inference(self, em_phase: str,
+                  data_loader: Union[NeighborLoader, DataLoader],
+                  verbose: bool = False):
+        r"""GLEM inference step.
+
+        Args:
+            em_phase (str): 'gnn' or 'lm'.
+            data_loader (DataLoader or NeighborLoader): Use DataLoader for LM
+                inference (includes tokenized data); use NeighborLoader for
+                GNN inference (includes x and edge_index).
+            verbose (bool): Whether to print inference progress.
+
+        Returns:
+            out (torch.Tensor): An n * m tensor, where m is the number of
+                classes and n is the number of nodes.
+        """
+        out = None
+        if em_phase == 'gnn':
+            self.gnn.eval()
+            out = self.inference_gnn(data_loader, verbose)
+        elif em_phase == 'lm':
+            self.lm.eval()
+            out = self.inference_lm(data_loader, verbose)
+        return out
+
+    @torch.no_grad()
+    def inference_lm(self, data_loader: DataLoader, verbose: bool = True):
+        r"""LM inference step.
+
+        Args:
+            data_loader (DataLoader): Includes tokens, labels and the gold
+                mask.
+            verbose (bool): Whether to print a progress bar.
+
+        Returns:
+            preds (torch.Tensor): Predictions from the LM; convert them to
+                pseudo labels by preds.argmax(dim=-1).unsqueeze(1).
+        """
+        if verbose:
+            pbar = tqdm(total=data_loader.dataset._data.num_nodes)
+            pbar.set_description('LM inference stage')
+        self.lm.eval()
+        preds = []
+        for batch in data_loader:
+            inputs = {k: v.to(self.device) for k, v in batch['input'].items()}
+            logits = self.lm(**inputs).logits
+            preds.append(logits)
+            if verbose:
+                pbar.update(batch['n_id'].size(0))
+        if verbose:
+            pbar.close()
+        preds = torch.cat(preds)
+        return preds
+
+    @torch.no_grad()
+    def inference_gnn(self, data_loader: NeighborLoader,
+                      verbose: bool = True):
+        r"""GNN inference step.
+
+        Args:
+            data_loader (NeighborLoader): Includes x and edge_index.
+            verbose (bool): Whether to print a progress bar.
+
+        Returns:
+            preds (torch.Tensor): Predictions from the GNN; convert them to
+                pseudo labels by preds.argmax(dim=-1).unsqueeze(1).
+        """
+        if verbose:
+            pbar = tqdm(total=data_loader.data.num_nodes)
+            pbar.set_description('GNN inference stage')
+        preds = []
+        self.gnn.eval()
+        for batch in data_loader:
+            batch = batch.to(self.device)
+            out = self.gnn(batch.x, batch.edge_index)[:batch.batch_size]
+            preds.append(out)
+            if verbose:
+                pbar.update(batch.batch_size)
+        if verbose:
+            pbar.close()
+        preds = torch.cat(preds, dim=0)
+        return preds
+
+    def loss(self, logits: torch.Tensor, labels: torch.Tensor,
+             loss_func: torch.nn.functional, is_gold: torch.Tensor,
+             pseudo_labels: torch.Tensor = None, pl_weight: float = 0.5,
+             is_augmented: bool = True):
+        r"""Core function of variational EM inference, aiming at combining
+        the loss value on gold (original train) labels and the loss value on
+        pseudo labels.
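+
+        With gold mask :math:`g` and pseudo-label weight :math:`w`
+        (:math:`\alpha` in the E-step, :math:`\beta` in the M-step), the
+        objective computed below is
+        :math:`w \cdot \ell(z[\neg g], \hat{y}[\neg g]) + (1 - w) \cdot
+        \ell(z[g], y[g])`, where :math:`z` are the logits and
+        :math:`\hat{y}` are the pseudo labels.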
+
+        Reference: # noqa
+
+        Args:
+            logits (torch.Tensor): Predicted results from the LM or GNN.
+            labels (torch.Tensor): Combined node labels from ground truth and
+                pseudo labels (if provided).
+            loss_func (torch.nn.modules.loss): Loss function for
+                classification.
+            is_gold (torch.Tensor): A boolean tensor that masks the
+                ground-truth (gold) labels during training; thus ~is_gold
+                masks the pseudo labels.
+            pseudo_labels (torch.Tensor): Predictions from the other model.
+            pl_weight: The pseudo-label weight used in the E-step and M-step
+                optimization; alpha in the E-step and beta in the M-step,
+                respectively.
+            is_augmented: Whether to use EM or to just train the GNN and LM
+                with gold data.
+        """
+        def deal_nan(x):
+            return 0 if torch.isnan(x) else x
+
+        if is_augmented and (sum(~is_gold) > 0):
+            mle_loss = deal_nan(loss_func(logits[is_gold], labels[is_gold]))
+            # all labels other than the ground truth (gold labels)
+            pseudo_label_loss = deal_nan(
+                loss_func(logits[~is_gold], pseudo_labels[~is_gold]))
+            loss = pl_weight * pseudo_label_loss + (1 - pl_weight) * mle_loss
+        else:
+            loss = loss_func(logits, labels)
+        return loss

From 529237c15198d483f8a0b417cc03bc6391854cc2 Mon Sep 17 00:00:00 2001
From: xnuohz
Date: Wed, 20 Nov 2024 09:44:31 +0800
Subject: [PATCH 03/10] Add MoleculeGPT (#9710)

### Issue

- #9694
- https://github.com/pyg-team/pytorch_geometric/issues/9698

### Feature Summary

- Add `MoleculeGPTDataset`
- Add `MoleculeGPT` as a GNN & LLM co-training model to PyG
- Add an example for training and testing
- Split the PR into 3 sub-PRs (#9723, #9724, #9725)
- Due to limited hardware resources, we can't load `lmsys/vicuna-7b-v1.5`;
  we use `TinyLlama/TinyLlama-1.1B-Chat-v0.1` instead, and the full training
  pipeline was not tested

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Giovanni Gatti <100698520+giovanni-gatti@users.noreply.github.com>
Co-authored-by: Rishi Puri
---
 CHANGELOG.md                               |   1 +
 examples/llm/README.md                     |   9 +-
 examples/llm/molecule_gpt.py               | 193 +++++++
 test/datasets/test_molecule_gpt_dataset.py |  10 +
 test/nn/attention/test_qformer.py          |  13 +
 test/nn/models/test_molecule_gpt.py        |  60 +++
 torch_geometric/datasets/__init__.py       |   2 +
 .../datasets/molecule_gpt_dataset.py       | 480 ++++++++++++++++++
 torch_geometric/nn/attention/__init__.py   |   6 +-
 torch_geometric/nn/attention/qformer.py    |  71 +++
 torch_geometric/nn/models/__init__.py      |   2 +
 torch_geometric/nn/models/molecule_gpt.py  | 222 ++++++++
 torch_geometric/nn/nlp/llm.py              |   2 +-
 .../nn/nlp/sentence_transformer.py         |   3 +
 14 files changed, 1068 insertions(+), 6 deletions(-)
 create mode 100644 examples/llm/molecule_gpt.py
 create mode 100644 test/datasets/test_molecule_gpt_dataset.py
 create mode 100644 test/nn/attention/test_qformer.py
 create mode 100644 test/nn/models/test_molecule_gpt.py
 create mode 100644 torch_geometric/datasets/molecule_gpt_dataset.py
 create mode 100644 torch_geometric/nn/attention/qformer.py
 create mode 100644 torch_geometric/nn/models/molecule_gpt.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 91da66973cef..9240420208bb 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
### Added +- Added `MoleculeGPT` example ([#9710](https://github.com/pyg-team/pytorch_geometric/pull/9710)) - Added `nn.models.GLEM` ([#9662](https://github.com/pyg-team/pytorch_geometric/pull/9662)) - Added `TAGDataset` ([#9662](https://github.com/pyg-team/pytorch_geometric/pull/9662)) - Added support for fast `Delaunay()` triangulation via the `torch_delaunay` package ([#9748](https://github.com/pyg-team/pytorch_geometric/pull/9748)) diff --git a/examples/llm/README.md b/examples/llm/README.md index e0ac02d87f2e..d860232aa56b 100644 --- a/examples/llm/README.md +++ b/examples/llm/README.md @@ -1,6 +1,7 @@ # Examples for Co-training LLMs and GNNs -| Example | Description | -| ------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| [`g_retriever.py`](./g_retriever.py) | Example for Retrieval-Augmented Generation (RAG) w/ GNN+LLM by co-training `LLAMA2` with `GAT` for answering questions based on knowledge graph information | -| [`glem.py`](./glem.py) | Example for [GLEM](https://arxiv.org/abs/2210.14709), a GNN+LLM co-training model via variational Expectation-Maximization (EM) framework on node classification tasks to achieve SOTA results | +| Example | Description | +| -------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [`g_retriever.py`](./g_retriever.py) | Example for Retrieval-Augmented Generation (RAG) w/ GNN+LLM by co-training `LLAMA2` with `GAT` for answering questions based on knowledge graph information | +| [`molecule_gpt.py`](./molecule_gpt.py) | Example for MoleculeGPT: Instruction Following Large Language Models for Molecular Property Prediction | +| [`glem.py`](./glem.py) | Example for [GLEM](https://arxiv.org/abs/2210.14709), a GNN+LLM co-training model via variational Expectation-Maximization (EM) framework on node classification tasks to achieve SOTA results | diff --git a/examples/llm/molecule_gpt.py b/examples/llm/molecule_gpt.py new file mode 100644 index 000000000000..8f6c6024014d --- /dev/null +++ b/examples/llm/molecule_gpt.py @@ -0,0 +1,193 @@ +"""This example implements the MoleculeGPT model +(https://ai4d3.github.io/papers/34.pdf) using PyG. 
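+Requirements on top of basic PyG:
+`pip install transformers sentencepiece accelerate rdkit`.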
+""" +import argparse +import math +import os.path as osp +import time + +import torch +from torch.nn.utils import clip_grad_norm_ +from tqdm import tqdm + +from torch_geometric import seed_everything +from torch_geometric.datasets import MoleculeGPTDataset +from torch_geometric.loader import DataLoader +from torch_geometric.nn import GINEConv +from torch_geometric.nn.models import MoleculeGPT +from torch_geometric.nn.nlp import LLM, SentenceTransformer + + +def save_params_dict(model, save_path): + state_dict = model.state_dict() + param_grad_dict = { + k: v.requires_grad + for (k, v) in model.named_parameters() + } + for k in list(state_dict.keys()): + if k in param_grad_dict.keys() and not param_grad_dict[k]: + del state_dict[k] # Delete parameters that do not require gradient + torch.save(state_dict, save_path) + + +@torch.no_grad() +def eval(model, data_loader): + model.eval() + loss = 0 + + for batch in data_loader: + batch_loss = model(batch.x, batch.edge_index, batch.batch, + batch.edge_attr, batch.smiles, batch.instruction, + batch.y) + loss += batch_loss.item() / len(data_loader) + return loss + + +def train( + num_epochs: int, + lr: float, + batch_size: int, + checkpointing: bool, +): + def adjust_learning_rate(param_group, LR, epoch): + # Decay the learning rate with half-cycle cosine after warmup + min_lr = 5e-6 + warmup_epochs = 1 + if epoch < warmup_epochs: + lr = LR + else: + lr = min_lr + (LR - min_lr) * 0.5 * ( + 1.0 + math.cos(math.pi * (epoch - warmup_epochs) / + (num_epochs - warmup_epochs))) + param_group['lr'] = lr + return lr + + start_time = time.time() + # Load dataset ================================================ + path = osp.dirname(osp.realpath(__file__)) + path = osp.join(path, '..', '..', 'data', 'MoleculeGPT') + dataset = MoleculeGPTDataset(path) + train_size, val_size = int(0.8 * len(dataset)), int(0.1 * len(dataset)) + train_dataset = dataset[:train_size] + val_dataset = dataset[train_size:train_size + val_size] + test_dataset = dataset[train_size + val_size:] + + seed_everything(42) + + train_loader = DataLoader(train_dataset, batch_size=batch_size, + drop_last=True, pin_memory=True, shuffle=True) + val_loader = DataLoader(val_dataset, batch_size=batch_size, + drop_last=False, pin_memory=True, shuffle=False) + test_loader = DataLoader(test_dataset, batch_size=batch_size, + drop_last=False, pin_memory=True, shuffle=False) + + # Create model =============================================== + llm = LLM( + # model_name='lmsys/vicuna-7b-v1.5', + model_name='TinyLlama/TinyLlama-1.1B-Chat-v0.1', + num_params=1, + dtype=torch.bfloat16, + ) + + graph_encoder = GINEConv( + nn=torch.nn.Sequential( + torch.nn.Linear(6, 768), + torch.nn.ReLU(), + torch.nn.Linear(768, 768), + ), + train_eps=True, + edge_dim=4, + ) + + smiles_encoder = SentenceTransformer( + model_name='DeepChem/ChemBERTa-77M-MTR', + pooling_strategy='last_hidden_state', + ) + + model = MoleculeGPT( + llm=llm, + graph_encoder=graph_encoder, + smiles_encoder=smiles_encoder, + ) + + # Train and eval ============================================ + params = [p for _, p in model.named_parameters() if p.requires_grad] + optimizer = torch.optim.AdamW([ + { + 'params': params, + 'lr': lr, + 'weight_decay': 0.05, + }, + ], betas=(0.9, 0.95)) + grad_steps = 2 + + best_epoch = 0 + best_val_loss = float('inf') + for epoch in range(num_epochs): + # Train + model.train() + epoch_loss = 0 + if epoch == 0: + print(f"Total Preparation Time: {time.time() - start_time:2f}s") + start_time = time.time() + print("Training 
beginning...")
+        epoch_str = f'Epoch: {epoch + 1}|{num_epochs}'
+        loader = tqdm(train_loader, desc=epoch_str)
+
+        for step, batch in enumerate(loader):
+            optimizer.zero_grad()
+            loss = model(batch.x, batch.edge_index, batch.batch,
+                         batch.edge_attr, batch.smiles, batch.instruction,
+                         batch.y)
+            loss.backward()
+            clip_grad_norm_(optimizer.param_groups[0]['params'], 0.1)
+
+            if (step + 1) % grad_steps == 0:
+                adjust_learning_rate(optimizer.param_groups[0], lr,
+                                     step / len(train_loader) + epoch)
+
+            optimizer.step()
+            epoch_loss += loss.item()
+
+            if (step + 1) % grad_steps == 0:
+                lr = optimizer.param_groups[0]['lr']
+        train_loss = epoch_loss / len(train_loader)
+
+        # Eval
+        val_loss = eval(model, val_loader)
+        print(
+            f'{epoch_str}, Train loss: {train_loss:.4f}, Val loss: {val_loss:.4f}'  # noqa: E501
+        )
+
+        if checkpointing and val_loss < best_val_loss:
+            best_val_loss = val_loss
+            best_epoch = epoch
+            save_params_dict(
+                model,
+                f'moleculegpt_epoch{best_epoch}_val_loss{best_val_loss:.4f}_ckpt.pt'  # noqa: E501
+            )
+        torch.cuda.empty_cache()
+        torch.cuda.reset_max_memory_allocated()
+
+    print(f"Total Training Time: {time.time() - start_time:.2f}s")
+    # Test
+    test_loss = eval(model, test_loader)
+    print(f'Test loss: {test_loss:.4f}')
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--epochs', type=int, default=3)
+    parser.add_argument('--lr', type=float, default=1e-5)
+    parser.add_argument('--batch_size', type=int, default=2)
+    parser.add_argument('--checkpointing', type=bool, default=True)
+    args = parser.parse_args()
+
+    start_time = time.time()
+    train(
+        args.epochs,
+        args.lr,
+        args.batch_size,
+        args.checkpointing,
+    )
+    print(f'Total Time: {time.time() - start_time:.2f}s')
diff --git a/test/datasets/test_molecule_gpt_dataset.py b/test/datasets/test_molecule_gpt_dataset.py
new file mode 100644
index 000000000000..7c00c5efc1b6
--- /dev/null
+++ b/test/datasets/test_molecule_gpt_dataset.py
@@ -0,0 +1,10 @@
+from torch_geometric.datasets import MoleculeGPTDataset
+from torch_geometric.testing import withPackage
+
+
+@withPackage('transformers', 'sentencepiece', 'accelerate', 'rdkit')
+def test_molecule_gpt_dataset():
+    dataset = MoleculeGPTDataset(root='./data/MoleculeGPT')
+    assert str(dataset) == f'MoleculeGPTDataset({len(dataset)})'
+    assert dataset.num_edge_features == 4
+    assert dataset.num_node_features == 6
diff --git a/test/nn/attention/test_qformer.py b/test/nn/attention/test_qformer.py
new file mode 100644
index 000000000000..0de023708fd8
--- /dev/null
+++ b/test/nn/attention/test_qformer.py
@@ -0,0 +1,13 @@
+import torch
+
+from torch_geometric.nn.attention import QFormer
+
+
+def test_qformer():
+    x = torch.randn(1, 4, 16)
+    attn = QFormer(input_dim=16, hidden_dim=16, output_dim=32, num_heads=4,
+                   num_layers=2)
+    out = attn(x)
+
+    assert out.shape == (1, 4, 32)
+    assert str(attn) == ('QFormer(num_heads=4, num_layers=2)')
diff --git a/test/nn/models/test_molecule_gpt.py b/test/nn/models/test_molecule_gpt.py
new file mode 100644
index 000000000000..c9f0a53403ee
--- /dev/null
+++ b/test/nn/models/test_molecule_gpt.py
@@ -0,0 +1,60 @@
+import torch
+from torch.nn import Linear as Lin
+from torch.nn import ReLU
+from torch.nn import Sequential as Seq
+
+from torch_geometric.nn import GINEConv, MoleculeGPT
+from torch_geometric.nn.nlp import LLM, SentenceTransformer
+from torch_geometric.testing import onlyFullTest, withPackage
+
+
+@onlyFullTest
+@withPackage('transformers', 'sentencepiece', 'accelerate')
+def test_molecule_gpt() -> None:
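+    # Assemble MoleculeGPT from its three components: a small LLM, a
+    # GINE-based graph encoder, and a ChemBERTa SMILES encoder.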
llm = LLM( + # model_name='lmsys/vicuna-7b-v1.5', + model_name='TinyLlama/TinyLlama-1.1B-Chat-v0.1', + num_params=1, + dtype=torch.bfloat16, + ) + + graph_encoder = GINEConv(nn=Seq(Lin(16, 16), ReLU(), Lin(16, 16)), + train_eps=True, edge_dim=16) + + smiles_encoder = SentenceTransformer( + model_name='DeepChem/ChemBERTa-77M-MTR', + pooling_strategy='last_hidden_state', + ) + + model = MoleculeGPT( + llm=llm, + graph_encoder=graph_encoder, + smiles_encoder=smiles_encoder, + ) + + assert str(model) == ( + 'MoleculeGPT(\n' + ' llm=LLM(TinyLlama/TinyLlama-1.1B-Chat-v0.1),\n' + ' graph=GINEConv,\n' + ' smiles=SentenceTransformer(model_name=DeepChem/ChemBERTa-77M-MTR),\n' # noqa: E501 + ')') + + x = torch.randn(10, 16) + edge_index = torch.tensor([ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], + [1, 2, 3, 4, 5, 6, 7, 8, 9, 0], + ]) + edge_attr = torch.randn(edge_index.size(1), 16) + batch = torch.zeros(x.size(0), dtype=torch.long) + smiles = ['CCCCCCCCCC'] + instructions = ['What is ∼ functional related to?'] + label = ['I do not know!'] + + # Test train: + loss = model(x, edge_index, batch, edge_attr, smiles, instructions, label) + assert loss >= 0 + + # Test inference: + pred = model.inference(x, edge_index, batch, edge_attr, smiles, + instructions) + assert len(pred) == 1 diff --git a/torch_geometric/datasets/__init__.py b/torch_geometric/datasets/__init__.py index 0b6569d3f92b..c086a85df779 100644 --- a/torch_geometric/datasets/__init__.py +++ b/torch_geometric/datasets/__init__.py @@ -77,6 +77,7 @@ from .brca_tgca import BrcaTcga from .neurograph import NeuroGraphDataset from .web_qsp_dataset import WebQSPDataset +from .molecule_gpt_dataset import MoleculeGPTDataset from .tag_dataset import TAGDataset from .dbp15k import DBP15K @@ -191,6 +192,7 @@ 'BrcaTcga', 'NeuroGraphDataset', 'WebQSPDataset', + 'MoleculeGPTDataset', 'TAGDataset', ] diff --git a/torch_geometric/datasets/molecule_gpt_dataset.py b/torch_geometric/datasets/molecule_gpt_dataset.py new file mode 100644 index 000000000000..b1da09f38570 --- /dev/null +++ b/torch_geometric/datasets/molecule_gpt_dataset.py @@ -0,0 +1,480 @@ +import gzip +import json +import multiprocessing +import os +import sys +from collections import defaultdict +from multiprocessing import Pool +from typing import Callable, List, Optional, Tuple + +import numpy as np +import requests +import torch +from tqdm import tqdm + +from torch_geometric.data import Data, InMemoryDataset, download_url +from torch_geometric.io import fs +from torch_geometric.nn.nlp import LLM +from torch_geometric.utils import one_hot + + +def clean_up_description(description: str) -> str: + description = description + " " + + # extra adj Pure + if description.startswith("Pure "): + description = description.replace("Pure ", "") + # fix typo + if description.startswith("Mercurycombines"): + description = description.replace("Mercurycombines", + "Mercury combines") + + # a special case + description = description.replace( + "17-Hydroxy-6-methylpregna-3,6-diene-3,20-dione. ", + "17-Hydroxy-6-methylpregna-3,6-diene-3,20-dione is ") + + # a special case + description = description.replace("5-Thymidylic acid. ", + "5-Thymidylic acid. is ") + + # a special case + description = description.replace( + "5'-S-(3-Amino-3-carboxypropyl)-5'-thioadenosine. ", + "5'-S-(3-Amino-3-carboxypropyl)-5'-thioadenosine. is ") + + # a special case + description = description.replace( + ("Guanosine 5'-(trihydrogen diphosphate), monoanhydride" + " with phosphorothioic acid. 
"), + ("Guanosine 5'-(trihydrogen diphosphate), monoanhydride" + " with phosphorothioic acid is ")) + + # a special case + description = description.replace("5'-Uridylic acid. ", + "5'-Uridylic acid is ") + + # a special case + description = description.replace("5'-Adenylic acid, ", + "5'-Adenylic acid is ") + + # a special case + description = description.replace( + "Uridine 5'-(tetrahydrogen triphosphate). ", + "Uridine 5'-(tetrahydrogen triphosphate). is ") + + # a special case + description = description.replace("Inosine 5'-Monophosphate. ", + "Inosine 5'-Monophosphate. is ") + + # a special case + description = description.replace("Pivaloyloxymethyl butyrate (AN-9), ", + "Pivaloyloxymethyl butyrate (AN-9) is ") + + # a special case + description = description.replace( + "4-Amino-5-cyano-7-(D-ribofuranosyl)-7H- pyrrolo(2,3-d)pyrimidine. ", + "4-Amino-5-cyano-7-(D-ribofuranosyl)-7H- pyrrolo(2,3-d)pyrimidine is ") + + # a special case + description = description.replace( + "Cardamonin (also known as Dihydroxymethoxychalcone), ", + "Cardamonin (also known as Dihydroxymethoxychalcone) is ") + + # a special case + description = description.replace("Lithium has been used to treat ", + "Lithium is ") + + # a special case + description = description.replace("4,4'-Methylenebis ", + "4,4'-Methylenebis is ") + + # a special case + description = description.replace( + "2,3,7,8-Tetrachlorodibenzo-p-dioxin", + "2,3,7,8-Tetrachlorodibenzo-p-dioxin is ") + + # a special case + description = description.replace("Exposure to 2,4,5-trichlorophenol ", + "2,4,5-Trichlorophenol exposure ") + + index = 0 + L = len(description) + if description.startswith('C.I. '): + start_index = len('C.I. ') + elif description.startswith('Nectriapyrone. D '): + start_index = len('Nectriapyrone. D ') + elif description.startswith( + 'Salmonella enterica sv. Minnesota LPS core oligosaccharide'): + start_index = len( + 'Salmonella enterica sv. Minnesota LPS core oligosaccharide') + else: + start_index = 0 + for index in range(start_index, L - 1): + if index < L - 2: + if description[index] == '.' 
and description[
+                    index + 1] == ' ' and 'A' <= description[index + 2] <= 'Z':
+                break
+        elif index == L - 2:
+            break
+
+    first_sentence = description[:index + 1]
+    return first_sentence
+
+
+def extract_name(name_raw: str,
+                 description: str) -> Tuple[Optional[str], str, str]:
+    first_sentence = clean_up_description(description)
+
+    splitter = ' -- -- '
+    if ' are ' in first_sentence or ' were ' in first_sentence:
+        replaced_words = 'These molecules'
+    else:
+        replaced_words = 'This molecule'
+
+    first_sentence = first_sentence.replace(' is ', splitter)
+    first_sentence = first_sentence.replace(' are ', splitter)
+    first_sentence = first_sentence.replace(' was ', splitter)
+    first_sentence = first_sentence.replace(' were ', splitter)
+    first_sentence = first_sentence.replace(' appears ', splitter)
+    first_sentence = first_sentence.replace(' occurs ', splitter)
+    first_sentence = first_sentence.replace(' stands for ', splitter)
+    first_sentence = first_sentence.replace(' belongs to ', splitter)
+    first_sentence = first_sentence.replace(' exists ',
+                                            splitter)  # only for CID=11443
+    first_sentence = first_sentence.replace(' has been used in trials ',
+                                            splitter)
+    first_sentence = first_sentence.replace(' has been investigated ',
+                                            splitter)
+    first_sentence = first_sentence.replace(' has many uses ', splitter)
+
+    if splitter in first_sentence:
+        extracted_name = first_sentence.split(splitter, 1)[0]
+    elif first_sentence.startswith(name_raw):
+        extracted_name = name_raw
+    elif name_raw in first_sentence:
+        extracted_name = name_raw
+        extracted_name = None  # The in-sentence match is logged but not used.
+        print("=====", name_raw)
+        print("first sentence: ", first_sentence)
+    else:
+        extracted_name = None
+
+    if extracted_name is not None:
+        extracted_description = description.replace(extracted_name,
+                                                    replaced_words)
+    else:
+        extracted_description = description
+
+    return extracted_name, extracted_description, first_sentence
+
+
+class MoleculeGPTDataset(InMemoryDataset):
+    r"""The dataset from the `"MoleculeGPT: Instruction Following Large
+    Language Models for Molecular Property Prediction"
+    `_ paper.
+
+    Args:
+        root (str): Root directory where the dataset should be saved.
+        transform (callable, optional): A function/transform that takes in an
+            :obj:`torch_geometric.data.Data` object and returns a transformed
+            version. The data object will be transformed before every access.
+            (default: :obj:`None`)
+        pre_transform (callable, optional): A function/transform that takes in
+            an :obj:`torch_geometric.data.Data` object and returns a
+            transformed version. The data object will be transformed before
+            being saved to disk. (default: :obj:`None`)
+        pre_filter (callable, optional): A function that takes in an
+            :obj:`torch_geometric.data.Data` object and returns a boolean
+            value, indicating whether the data object should be included in
+            the final dataset. (default: :obj:`None`)
+        force_reload (bool, optional): Whether to re-process the dataset.
+            (default: :obj:`False`)
+        total_page_num (int, optional): The number of annotation pages to
+            fetch from PubChem. (default: :obj:`10`)
+        total_block_num (int, optional): The number of SDF file blocks to
+            download from PubChem.
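+            Each block covers 500,000 compound IDs.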
+ (default: :obj:`1`) + """ + description_url = ( + 'https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/annotations/' + 'heading/json?heading_type=Compound&heading=Record+Description&page={}' + ) + compound_url = ('https://ftp.ncbi.nlm.nih.gov/pubchem/Compound/' + 'CURRENT-Full/SDF') + + def __init__( + self, + root: str, + transform: Optional[Callable] = None, + pre_transform: Optional[Callable] = None, + pre_filter: Optional[Callable] = None, + force_reload: bool = False, + total_page_num: int = 10, + total_block_num: int = 1, + ): + self.total_page_num = total_page_num + self.total_block_num = total_block_num + + super().__init__(root, transform, pre_transform, pre_filter, + force_reload=force_reload) + self.load(self.processed_paths[0]) + + @property + def raw_file_names(self) -> List[str]: + return ['pubchem.csv'] + + @property + def processed_file_names(self) -> List[str]: + return ['data.pt'] + + def download(self) -> None: + # Step 01. Extract description + step1_folder = f"{self.raw_dir}/step_01_PubChemSTM_description" + if not os.path.exists(step1_folder): + os.makedirs(step1_folder) + valid_CID_set = set() + CID2name_raw, CID2name_extracted = defaultdict(list), defaultdict( + list) + CID2text_raw, CID2text_extracted = defaultdict(list), defaultdict( + list) + + for page_index in tqdm(range(self.total_page_num)): + page_num = page_index + 1 + f_out = open( + f"{step1_folder}/Compound_description_{page_num}.txt", "w") + + description_data = requests.get( + self.description_url.format(page_num)).json() + + description_data = description_data["Annotations"] + assert description_data["Page"] == page_num + + record_list = description_data["Annotation"] + + for record in record_list: + try: + CID = record["LinkedRecords"]["CID"][0] + if "Name" in record: + name_raw = record["Name"] + CID2name_raw[CID].append(name_raw) + else: + name_raw = None + + data_list = record["Data"] + for data in data_list: + description = data["Value"]["StringWithMarkup"][0][ + "String"].strip() + + extracted_name, extracted_description, _ = extract_name( # noqa: E501 + name_raw, description) + if extracted_name is not None: + CID2name_extracted[CID].append(extracted_name) + + CID2text_raw[CID].append(description) + CID2text_extracted[CID].append( + extracted_description) + + valid_CID_set.add(CID) + f_out.write(f"{CID}\n") + f_out.write(f"{extracted_description}\n\n") + except Exception: + continue + + valid_CID_list = sorted(list(valid_CID_set)) + print(f"Total CID (with raw name) {len(CID2name_raw)}") + print(f"Total CID (with extracted name) {len(CID2name_extracted)}") + print(f"Total CID {len(valid_CID_list)}") + + with open(f"{self.raw_dir}/CID2name_raw.json", "w") as f: + json.dump(CID2name_raw, f) + + with open(f"{self.raw_dir}/CID2name.json", "w") as f: + json.dump(CID2name_extracted, f) + + with open(f"{self.raw_dir}/CID2text_raw.json", "w") as f: + json.dump(CID2text_raw, f) + + with open(f"{self.raw_dir}/CID2text.json", "w") as f: + json.dump(CID2text_extracted, f) + + # Step 02. 
Download SDF Files + step2_folder = f"{self.raw_dir}/step_02_PubChemSTM_SDF" + if not os.path.exists(step2_folder): + for block_id in tqdm(range(self.total_block_num)): + block_size = 500000 + l_id = block_id * block_size + 1 + r_id = (block_id + 1) * block_size + + compound_file_name = f"Compound_{l_id:09d}_{r_id:09d}.sdf.gz" + download_url(f"{self.compound_url}/{compound_file_name}", + step2_folder) + + def process(self, use_mp: bool = False) -> None: + try: + from rdkit import Chem + from rdkit.Chem.rdchem import BondType as BT + WITH_RDKIT = True + + except ImportError: + WITH_RDKIT = False + + if not WITH_RDKIT: + print(("Using a pre-processed version of the dataset. Please " + "install 'rdkit' to alternatively process the raw data."), + file=sys.stderr) + + data_list = fs.torch_load(self.raw_paths[0]) + data_list = [Data(**data_dict) for data_dict in data_list] + + if self.pre_filter is not None: + data_list = [d for d in data_list if self.pre_filter(d)] + + if self.pre_transform is not None: + data_list = [self.pre_transform(d) for d in data_list] + + self.save(data_list, self.processed_paths[0]) + return + + # Step 03. Filter out SDF + step2_folder = f"{self.raw_dir}/step_02_PubChemSTM_SDF" + step3_folder = f"{self.raw_dir}/step_03_PubChemSTM_filtered" + if not os.path.exists(step3_folder): + os.makedirs(step3_folder) + with open(f"{self.raw_dir}/CID2text.json") as f: + CID2text = json.load(f) + target_CID_list = set(CID2text.keys()) + + block_size = 500000 + + def extract_one_SDF_file(block_id: int) -> None: + valid_mol_count = 0 + + writer = Chem.SDWriter( + f'{step3_folder}/filtered_{block_id}.sdf') + l_id = block_id * block_size + 1 + r_id = (block_id + 1) * block_size + + compound_file_name = f"Compound_{l_id:09d}_{r_id:09d}.sdf.gz" + gzip_loader = gzip.open(f"{step2_folder}/{compound_file_name}") + suppl = Chem.ForwardSDMolSupplier(gzip_loader) + + for mol in tqdm(suppl): + if mol is None: + continue + cid = mol.GetProp("PUBCHEM_COMPOUND_CID") + + if cid not in target_CID_list: + continue + + writer.write(mol) + valid_mol_count += 1 + + print(f"block id: {block_id}\nfound {valid_mol_count}\n\n") + sys.stdout.flush() + return + + if use_mp: + num_process = multiprocessing.cpu_count() + print(f"{num_process} CPUs") + num_process = 8 + p = Pool(num_process) + + block_id_list = np.arange(self.total_block_num) + with p: + p.map(extract_one_SDF_file, block_id_list) + else: + for block_id in range(self.total_block_num): + extract_one_SDF_file(block_id) + + # Step 04. Merge SDF + with open(f"{self.raw_dir}/CID2text.json") as f: + CID2text = json.load(f) + target_CID_list = set(CID2text.keys()) + print(f'The length of target_CID_list: {len(target_CID_list)}') + + writer = Chem.SDWriter(f'{self.raw_dir}/molecules.sdf') + + found_CID_set = set() + for block_id in range(self.total_block_num + 1): + compound_file_path = f"{step3_folder}/filtered_{block_id}.sdf" + try: + suppl = Chem.SDMolSupplier(compound_file_path) + + for mol in tqdm(suppl): + writer.write(mol) + cid = mol.GetProp("PUBCHEM_COMPOUND_CID") + found_CID_set.add(cid) + except Exception: + print(f"block id: {block_id} with 0 valid SDF file") + continue + + print(f"In total: {len(found_CID_set)} molecules") + + # Step 05. 
Convert to PyG data format
+        types = {'H': 0, 'C': 1, 'N': 2, 'O': 3, 'F': 4, 'Unknown': 5}
+        bonds = {BT.SINGLE: 0, BT.DOUBLE: 1, BT.TRIPLE: 2, BT.AROMATIC: 3}
+
+        data_list = []
+        # Pair each molecule with its extracted text description:
+        CID2text_file = f'{self.raw_dir}/CID2text.json'
+
+        with open(CID2text_file) as f:
+            CID2text_data = json.load(f)
+
+        suppl = Chem.SDMolSupplier(f'{self.raw_dir}/molecules.sdf')
+
+        llm = LLM(
+            # model_name='lmsys/vicuna-7b-v1.5',
+            model_name='TinyLlama/TinyLlama-1.1B-Chat-v0.1',
+            num_params=1,
+            dtype=torch.bfloat16,
+        )
+        prompt = ("Propose a question regarding the molecule '∼' "
+                  "whose answer is: {}:")
+        for mol in tqdm(suppl):
+            if mol.HasProp('PUBCHEM_COMPOUND_CID'):
+                CID = mol.GetProp("PUBCHEM_COMPOUND_CID")
+                CAN_SMILES = mol.GetProp("PUBCHEM_OPENEYE_CAN_SMILES")
+
+                m: Chem.Mol = Chem.MolFromSmiles(CAN_SMILES)
+                if m is None:
+                    continue
+                RDKit_CAN_SMILES = Chem.MolToSmiles(m)
+
+                ground_truth = CID2text_data[CID][0]
+
+                instruction = llm.inference([prompt.format(ground_truth)])[0]
+
+                x: torch.Tensor = torch.tensor([
+                    types[atom.GetSymbol()] if atom.GetSymbol() in types else 5
+                    for atom in m.GetAtoms()  # type: ignore
+                ])
+                x = one_hot(x, num_classes=len(types), dtype=torch.float)
+
+                rows, cols, edge_types = [], [], []
+                for bond in m.GetBonds():  # type: ignore
+                    i, j = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
+                    edge_types += [bonds[bond.GetBondType()]] * 2
+                    rows += [i, j]
+                    cols += [j, i]
+
+                edge_index = torch.tensor([rows, cols], dtype=torch.long)
+                edge_type = torch.tensor(edge_types, dtype=torch.long)
+                edge_attr = one_hot(edge_type, num_classes=len(bonds))
+
+                data = Data(
+                    x=x,
+                    edge_index=edge_index,
+                    edge_attr=edge_attr,
+                    smiles=RDKit_CAN_SMILES,
+                    instruction=instruction,
+                    y=ground_truth,
+                )
+
+                if self.pre_filter is not None and not self.pre_filter(data):
+                    continue
+                if self.pre_transform is not None:
+                    data = self.pre_transform(data)
+
+                data_list.append(data)
+
+        self.save(data_list, self.processed_paths[0])
diff --git a/torch_geometric/nn/attention/__init__.py b/torch_geometric/nn/attention/__init__.py
index 947d5850173b..6b4064cd34b9 100644
--- a/torch_geometric/nn/attention/__init__.py
+++ b/torch_geometric/nn/attention/__init__.py
@@ -1,3 +1,7 @@
 from .performer import PerformerAttention
+from .qformer import QFormer

-__all__ = ['PerformerAttention']
+__all__ = [
+    'PerformerAttention',
+    'QFormer',
+]
diff --git a/torch_geometric/nn/attention/qformer.py b/torch_geometric/nn/attention/qformer.py
new file mode 100644
index 000000000000..3a8f512d3f83
--- /dev/null
+++ b/torch_geometric/nn/attention/qformer.py
@@ -0,0 +1,71 @@
+from typing import Callable
+
+import torch
+
+
+class QFormer(torch.nn.Module):
+    r"""The Querying Transformer (Q-Former) from
+    `"BLIP-2: Bootstrapping Language-Image Pre-training
+    with Frozen Image Encoders and Large Language Models"
+    <https://arxiv.org/abs/2301.12597>`_ paper.
+
+    Args:
+        input_dim (int): The number of features in the input.
+        hidden_dim (int): The dimension of the feed-forward network (FFN)
+            in each encoder layer.
+        output_dim (int): The final output dimension.
+        num_heads (int): The number of attention heads.
+        num_layers (int): The number of sub-encoder-layers in the encoder.
+        dropout (float, optional): The dropout probability in each encoder
+            layer. (default: :obj:`0.0`)
+        activation (callable, optional): The activation function of the
+            encoder layers. (default: :obj:`torch.nn.ReLU()`)
+
+    .. note::
+        This is a simplified version of the original Q-Former implementation.
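+
+    Example (a minimal usage sketch mirroring the unit test added in this
+    patch; the dimensions are purely illustrative):
+
+    .. code-block:: python
+
+        import torch
+
+        attn = QFormer(input_dim=16, hidden_dim=16, output_dim=32,
+                       num_heads=4, num_layers=2)
+        out = attn(torch.randn(1, 4, 16))  # [batch_size, seq_len, input_dim]
+        assert out.shape == (1, 4, 32)  # Projected to `output_dim`.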
+ """ + def __init__( + self, + input_dim: int, + hidden_dim: int, + output_dim: int, + num_heads: int, + num_layers: int, + dropout: float = 0.0, + activation: Callable = torch.nn.ReLU(), + ) -> None: + + super().__init__() + self.num_layers = num_layers + self.num_heads = num_heads + + self.layer_norm = torch.nn.LayerNorm(input_dim) + self.encoder_layer = torch.nn.TransformerEncoderLayer( + d_model=input_dim, + nhead=num_heads, + dim_feedforward=hidden_dim, + dropout=dropout, + activation=activation, + batch_first=True, + ) + self.encoder = torch.nn.TransformerEncoder( + self.encoder_layer, + num_layers=num_layers, + ) + self.project = torch.nn.Linear(input_dim, output_dim) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + r"""Forward pass. + + Args: + x (torch.Tensor): Input sequence to the encoder layer. + :math:`\mathbf{X} \in \mathbb{R}^{B \times N \times F}`, with + batch-size :math:`B`, sequence length :math:`N`, + and feature dimension :math:`F`. + """ + x = self.layer_norm(x) + x = self.encoder(x) + out = self.project(x) + return out + + def __repr__(self) -> str: + return (f'{self.__class__.__name__}(' + f'num_heads={self.num_heads}, ' + f'num_layers={self.num_layers})') diff --git a/torch_geometric/nn/models/__init__.py b/torch_geometric/nn/models/__init__.py index 5860db311ac3..9aeac020264a 100644 --- a/torch_geometric/nn/models/__init__.py +++ b/torch_geometric/nn/models/__init__.py @@ -29,6 +29,7 @@ from .neural_fingerprint import NeuralFingerprint from .visnet import ViSNet from .g_retriever import GRetriever +from .molecule_gpt import MoleculeGPT from .glem import GLEM # Deprecated: from torch_geometric.explain.algorithm.captum import (to_captum_input, @@ -77,5 +78,6 @@ 'NeuralFingerprint', 'ViSNet', 'GRetriever', + 'MoleculeGPT', 'GLEM', ] diff --git a/torch_geometric/nn/models/molecule_gpt.py b/torch_geometric/nn/models/molecule_gpt.py new file mode 100644 index 000000000000..a0ac73ad9abb --- /dev/null +++ b/torch_geometric/nn/models/molecule_gpt.py @@ -0,0 +1,222 @@ +from typing import List, Optional + +import torch +from torch import Tensor + +from torch_geometric.nn.attention import QFormer +from torch_geometric.nn.nlp.llm import BOS, LLM, MAX_NEW_TOKENS +from torch_geometric.utils import to_dense_batch + + +def pad_or_truncate(embeddings: Tensor, max_seq_len: int, + padding_value: int = 0) -> Tensor: + batch_size, current_seq_len, d = embeddings.size() + + if current_seq_len > max_seq_len: + return embeddings[:, :max_seq_len, :] + elif current_seq_len < max_seq_len: + pad_tensor = torch.full((batch_size, max_seq_len - current_seq_len, d), + padding_value, dtype=embeddings.dtype, + device=embeddings.device) + return torch.cat([embeddings, pad_tensor], dim=1) + else: + return embeddings + + +class MoleculeGPT(torch.nn.Module): + r"""The MoleculeGPT model from the `"MoleculeGPT: Instruction + Following Large Language Models for Molecular Property Prediction" + `_ paper. + + Args: + llm (LLM): The LLM to use. + graph_encoder (torch.nn.Module): Encode 2D molecule graph. + smiles_encoder (torch.nn.Module): Encode 1D SMILES. + mlp_out_channels (int, optional): The size of each embedding + after qformer encoding. (default: :obj:`32`) + max_tokens (int, optional): Max output tokens of 1D/2D encoder. + (default: :obj:`20`) + + .. warning:: + This module has been tested with the following HuggingFace models + + * :obj:`llm_to_use="lmsys/vicuna-7b-v1.5"` + + and may not work with other models. 
See other models at `HuggingFace + Models `_ and let us know if you + encounter any issues. + + .. note:: + For an example of using :class:`MoleculeGPT`, see + `examples/llm/molecule_gpt.py `_. + """ + def __init__( + self, + llm: LLM, + graph_encoder: torch.nn.Module, + smiles_encoder: torch.nn.Module, + mlp_out_channels: int = 32, + max_tokens: Optional[int] = 20, + ) -> None: + super().__init__() + self.llm = llm + self.graph_encoder = graph_encoder.to(self.llm.device) + self.smiles_encoder = smiles_encoder.to(self.llm.device) + + self.graph_qformer = QFormer( + input_dim=self.graph_encoder.nn[-1].out_features, + hidden_dim=mlp_out_channels, + output_dim=mlp_out_channels, + num_heads=4, + num_layers=2, + ).to(self.llm.device) + + self.smiles_qformer = QFormer( + input_dim=self.smiles_encoder.model.pooler.dense.out_features, + hidden_dim=mlp_out_channels, + output_dim=mlp_out_channels, + num_heads=4, + num_layers=2, + ).to(self.llm.device) + + self.max_tokens = max_tokens + + self.word_embedding = self.llm.word_embedding + self.llm_generator = self.llm.llm + + # LLMs + in_dim = 2 * mlp_out_channels * max_tokens + out_dim = self.llm.llm.model.embed_tokens.embedding_dim + self.projector = torch.nn.Sequential( + torch.nn.Linear(in_dim, in_dim), + torch.nn.Sigmoid(), + torch.nn.Linear(in_dim, out_dim), + ).to(self.llm.device) + + def encode( + self, + x: Tensor, + edge_index: Tensor, + batch: Tensor, + edge_attr: Optional[Tensor], + smiles: List[str], + ) -> Tensor: + batch_size = len(smiles) + # 2D Graph Branch: [bs, node_len, d] + x = x.to(self.llm.device) + edge_index = edge_index.to(self.llm.device) + if edge_attr is not None: + edge_attr = edge_attr.to(self.llm.device) + batch = batch.to(self.llm.device) + + x_graph = self.graph_encoder(x, edge_index, edge_attr=edge_attr) + x_graph = to_dense_batch(x_graph, batch)[0] + out_graph = self.graph_qformer(x_graph) + out_graph = pad_or_truncate(out_graph, max_seq_len=self.max_tokens, + padding_value=0) + out_graph = out_graph.view(batch_size, -1) + + # 1D SMILES Branch: [bs, seq_len, d] + x_smiles = self.smiles_encoder.encode(smiles, + output_device=self.llm.device) + out_smiles = self.smiles_qformer(x_smiles) + out_smiles = pad_or_truncate(out_smiles, max_seq_len=self.max_tokens, + padding_value=0) + out_smiles = out_smiles.view(batch_size, -1) + + # Merge into LLMs + x_cat = torch.cat([out_graph, out_smiles], dim=1) + return x_cat + + def forward( + self, + x: Tensor, + edge_index: Tensor, + batch: Tensor, + edge_attr: Optional[Tensor], + smiles: List[str], + instructions: List[str], + label: List[str], + additional_text_context: Optional[List[str]] = None, + ): + x = self.encode(x, edge_index, batch, edge_attr, smiles) + x = self.projector(x) + xs = x.split(1, dim=0) + + batch_unique = batch.unique() + batch_size = len(instructions) + if len(batch_unique) < batch_size: + xs = [ + xs[i] if i in batch_unique else None for i in range(batch_size) + ] + + ( + inputs_embeds, + attention_mask, + label_input_ids, + ) = self.llm._get_embeds(instructions, additional_text_context, xs, + label) + + with self.llm.autocast_context: + outputs = self.llm_generator( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + return_dict=True, + labels=label_input_ids, + ) + + return outputs.loss + + @torch.no_grad() + def inference( + self, + x: Tensor, + edge_index: Tensor, + batch: Tensor, + edge_attr: Optional[Tensor], + smiles: List[str], + instructions: List[str], + additional_text_context: Optional[List[str]] = None, + max_out_tokens: 
Optional[int] = MAX_NEW_TOKENS, + ): + x = self.encode(x, edge_index, batch, edge_attr, smiles) + x = self.projector(x) + xs = x.split(1, dim=0) + + # Handle questions without node features: + batch_unique = batch.unique() + batch_size = len(instructions) + if len(batch_unique) < batch_size: + xs = [ + xs[i] if i in batch_unique else None for i in range(batch_size) + ] + + inputs_embeds, attention_mask, _ = self.llm._get_embeds( + instructions, additional_text_context, xs) + + bos_token = self.llm.tokenizer( + BOS, + add_special_tokens=False, + ).input_ids[0] + + with self.llm.autocast_context: + outputs = self.llm_generator.generate( + inputs_embeds=inputs_embeds, + max_new_tokens=max_out_tokens, + attention_mask=attention_mask, + bos_token_id=bos_token, + use_cache=True # Important to set! + ) + + return self.llm.tokenizer.batch_decode( + outputs, + skip_special_tokens=True, + ) + + def __repr__(self) -> str: + return (f'{self.__class__.__name__}(\n' + f' llm={self.llm},\n' + f' graph={self.graph_encoder.__class__.__name__},\n' + f' smiles={self.smiles_encoder},\n' + f')') diff --git a/torch_geometric/nn/nlp/llm.py b/torch_geometric/nn/nlp/llm.py index b58059f8e098..d18aa42382f7 100644 --- a/torch_geometric/nn/nlp/llm.py +++ b/torch_geometric/nn/nlp/llm.py @@ -56,7 +56,7 @@ class LLM(torch.nn.Module): allocate the correct number of GPUs needed, given the available GPU memory of your GPUs. dtype (torch.dtype, optional): The data type to use for the LLM. - (default :obj: `torch.bloat16`) + (default :obj: `torch.bfloat16`) """ def __init__( self, diff --git a/torch_geometric/nn/nlp/sentence_transformer.py b/torch_geometric/nn/nlp/sentence_transformer.py index c66677e8fa24..715f343bfc19 100644 --- a/torch_geometric/nn/nlp/sentence_transformer.py +++ b/torch_geometric/nn/nlp/sentence_transformer.py @@ -10,6 +10,7 @@ class PoolingStrategy(Enum): MEAN = 'mean' LAST = 'last' CLS = 'cls' + LAST_HIDDEN_STATE = 'last_hidden_state' class SentenceTransformer(torch.nn.Module): @@ -38,6 +39,8 @@ def forward(self, input_ids: Tensor, attention_mask: Tensor) -> Tensor: emb = mean_pooling(emb, attention_mask) elif self.pooling_strategy == PoolingStrategy.LAST: emb = last_pooling(emb, attention_mask) + elif self.pooling_strategy == PoolingStrategy.LAST_HIDDEN_STATE: + emb = out.last_hidden_state else: assert self.pooling_strategy == PoolingStrategy.CLS emb = emb[:, 0, :] From 83485e3cad23c8e93e399314a305463d2bc94c52 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Wed, 20 Nov 2024 09:36:53 -0800 Subject: [PATCH 04/10] Add comment in `g_retriever.py` pointing to `Neo4j` Graph DB integration demo (#9797) --- CHANGELOG.md | 1 + examples/llm/g_retriever.py | 3 +++ 2 files changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9240420208bb..d7f1db3c8748 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
 ### Added

+- Added comment in `g_retriever.py` pointing to `Neo4j` Graph DB integration demo ([#9797](https://github.com/pyg-team/pytorch_geometric/pull/9797))
 - Added `MoleculeGPT` example ([#9710](https://github.com/pyg-team/pytorch_geometric/pull/9710))
 - Added `nn.models.GLEM` ([#9662](https://github.com/pyg-team/pytorch_geometric/pull/9662))
 - Added `TAGDataset` ([#9662](https://github.com/pyg-team/pytorch_geometric/pull/9662))
diff --git a/examples/llm/g_retriever.py b/examples/llm/g_retriever.py
index 1735d17f5249..984ce3f010e7 100644
--- a/examples/llm/g_retriever.py
+++ b/examples/llm/g_retriever.py
@@ -6,6 +6,9 @@
 Requirements:
 `pip install datasets transformers pcst_fast sentencepiece accelerate`
+
+Example repo for integration with Neo4j Graph DB:
+https://github.com/neo4j-product-examples/neo4j-gnn-llm-example
 """
 import argparse
 import math

From f73242708262640c644c539e4eae6ea30ad3f477 Mon Sep 17 00:00:00 2001
From: xnuohz
Date: Mon, 25 Nov 2024 13:39:08 +0800
Subject: [PATCH 05/10] Add GIT-Mol (#9730)

### Issue

- #9694
- https://github.com/pyg-team/pytorch_geometric/issues/9700

### Feature Summary

- Add `GitMolDataset`
- Add `GITMol` as a GNN & LLM co-training model to PyG
- Add an example for pre-training
- Due to limited hardware resources, the full training pipeline was not tested
- The multi-modal cross-attention layers share the same weights, which is not aligned with the original paper

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Rishi Puri
---
 CHANGELOG.md | 1 +
 examples/llm/README.md | 1 +
 examples/llm/git_mol.py | 133 +++++++
 test/datasets/test_git_mol_dataset.py | 19 +
 test/nn/models/test_git_mol.py | 24 ++
 test/nn/nlp/test_vision_transformer.py | 26 ++
 torch_geometric/datasets/__init__.py | 2 +
 torch_geometric/datasets/git_mol_dataset.py | 263 ++++++++
 torch_geometric/nn/models/__init__.py | 2 +
 torch_geometric/nn/models/git_mol.py | 336 ++++++++++++++++++
 torch_geometric/nn/nlp/__init__.py | 2 +
 .../nn/nlp/sentence_transformer.py | 30 ++
 torch_geometric/nn/nlp/vision_transformer.py | 33 ++
 13 files changed, 872 insertions(+)
 create mode 100644 examples/llm/git_mol.py
 create mode 100644 test/datasets/test_git_mol_dataset.py
 create mode 100644 test/nn/models/test_git_mol.py
 create mode 100644 test/nn/nlp/test_vision_transformer.py
 create mode 100644 torch_geometric/datasets/git_mol_dataset.py
 create mode 100644 torch_geometric/nn/models/git_mol.py
 create mode 100644 torch_geometric/nn/nlp/vision_transformer.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d7f1db3c8748..86e06d8835e8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
### Added +- Added `GIT-Mol` ([#9730](https://github.com/pyg-team/pytorch_geometric/pull/9730)) - Added comment in `g_retriever.py` pointing to `Neo4j` Graph DB integration demo ([#9748](https://github.com/pyg-team/pytorch_geometric/pull/9797)) - Added `MoleculeGPT` example ([#9710](https://github.com/pyg-team/pytorch_geometric/pull/9710)) - Added `nn.models.GLEM` ([#9662](https://github.com/pyg-team/pytorch_geometric/pull/9662)) diff --git a/examples/llm/README.md b/examples/llm/README.md index d860232aa56b..eb471563de8e 100644 --- a/examples/llm/README.md +++ b/examples/llm/README.md @@ -3,5 +3,6 @@ | Example | Description | | -------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | [`g_retriever.py`](./g_retriever.py) | Example for Retrieval-Augmented Generation (RAG) w/ GNN+LLM by co-training `LLAMA2` with `GAT` for answering questions based on knowledge graph information | +| [`git_mol.py`](./git_mol.py) | Example for GIT-Mol: A Multi-modal Large Language Model for Molecular Science with Graph, Image, and Text | | [`molecule_gpt.py`](./molecule_gpt.py) | Example for MoleculeGPT: Instruction Following Large Language Models for Molecular Property Prediction | | [`glem.py`](./glem.py) | Example for [GLEM](https://arxiv.org/abs/2210.14709), a GNN+LLM co-training model via variational Expectation-Maximization (EM) framework on node classification tasks to achieve SOTA results | diff --git a/examples/llm/git_mol.py b/examples/llm/git_mol.py new file mode 100644 index 000000000000..d05104db050c --- /dev/null +++ b/examples/llm/git_mol.py @@ -0,0 +1,133 @@ +"""This example implements the GIT-Mol model +(https://arxiv.org/abs/2308.06911) using PyG. 
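+
+Requirements on top of basic PyG (taken from this example's imports and the
+dataset's optional dependencies):
+`pip install accelerate torchvision Pillow rdkit`.
+
+Example usage (arguments shown with their defaults):
+`python git_mol.py --epochs 3 --lr 1e-5 --batch_size 4`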
+"""
+import argparse
+import os.path as osp
+
+import torch
+from accelerate import Accelerator
+from torch.optim.lr_scheduler import StepLR
+from tqdm import tqdm
+
+from torch_geometric import seed_everything
+from torch_geometric.datasets import GitMolDataset
+from torch_geometric.loader import DataLoader
+from torch_geometric.nn.models import GITMol
+
+
+@torch.no_grad()
+def eval(model, data_loader):
+    model.eval()
+    loss = 0
+
+    for batch in data_loader:
+        batch_loss = model(batch.x, batch.edge_index, batch.batch,
+                           batch.edge_attr, batch.smiles, batch.image,
+                           batch.caption)
+        loss += batch_loss.item() / len(data_loader)
+    return loss
+
+
+def train(
+    num_epochs: int,
+    lr: float,
+    weight_decay: float,
+    batch_size: int,
+    checkpointing: bool,
+):
+    # Load dataset ================================================
+    path = osp.dirname(osp.realpath(__file__))
+    path = osp.join(path, '..', '..', 'data', 'GITMol')
+    train_dataset = GitMolDataset(path, split=0)
+    val_dataset = GitMolDataset(path, split=1)
+    test_dataset = GitMolDataset(path, split=2)
+
+    seed_everything(42)
+
+    train_loader = DataLoader(train_dataset, batch_size=batch_size,
+                              drop_last=True, pin_memory=True, shuffle=True)
+    val_loader = DataLoader(val_dataset, batch_size=batch_size,
+                            drop_last=False, pin_memory=True, shuffle=False)
+    test_loader = DataLoader(test_dataset, batch_size=batch_size,
+                             drop_last=False, pin_memory=True, shuffle=False)
+
+    # Create model ===============================================
+    accelerator = Accelerator()
+    device = accelerator.device
+    model = GITMol().to(device)
+    optimizer = torch.optim.AdamW(
+        [p for p in model.parameters() if p.requires_grad], lr=lr,
+        weight_decay=weight_decay)
+    scheduler = StepLR(optimizer, step_size=1, gamma=0.1)
+    model, optimizer, train_loader, scheduler = accelerator.prepare(
+        model, optimizer, train_loader, scheduler)
+    val_loader = accelerator.prepare_data_loader(val_loader,
+                                                 device_placement=True)
+    test_loader = accelerator.prepare_data_loader(test_loader,
+                                                  device_placement=True)
+
+    # Train and eval ============================================
+    best_epoch = 0
+    best_val_loss = float('inf')
+    for epoch in range(num_epochs):
+        # Train
+        model.train()
+        epoch_loss = 0
+        if epoch == 0:
+            print("Training beginning...")
+        epoch_str = f'Epoch: {epoch + 1}|{num_epochs}'
+
+        for batch in tqdm(train_loader, desc=epoch_str):
+            optimizer.zero_grad()
+            loss = model(batch.x, batch.edge_index, batch.batch,
+                         batch.edge_attr, batch.smiles, batch.image,
+                         batch.caption)
+            accelerator.backward(loss)
+
+            optimizer.step()
+            epoch_loss += loss.item()
+
+        train_loss = epoch_loss / len(train_loader)
+
+        # Eval
+        val_loss = eval(model, val_loader)
+        print(
+            f'{epoch_str}, Train loss: {train_loss:.4f}, Val loss: {val_loss:.4f}'  # noqa: E501
+        )
+
+        if checkpointing and val_loss < best_val_loss:
+            best_val_loss = val_loss
+            best_epoch = epoch
+            torch.save(
+                {
+                    'model_state_dict':
+                    accelerator.unwrap_model(model).state_dict(),
+                    'best_loss':
+                    best_val_loss
+                },
+                f'gitmol_pretrain_epoch{best_epoch}_val_loss{best_val_loss:.4f}_ckpt.pt'  # noqa: E501
+            )
+        torch.cuda.empty_cache()
+        torch.cuda.reset_max_memory_allocated()
+
+    # Test
+    test_loss = eval(model, test_loader)
+    print(f'Test loss: {test_loss:.4f}')
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--epochs', type=int, default=3)
+    parser.add_argument('--lr', type=float, default=1e-5)
+    parser.add_argument('--batch_size', type=int, default=4)
parser.add_argument("--weight_decay", type=float, default=0.01) + parser.add_argument('--checkpointing', type=bool, default=True) + args = parser.parse_args() + + train( + args.epochs, + args.lr, + args.weight_decay, + args.batch_size, + args.checkpointing, + ) diff --git a/test/datasets/test_git_mol_dataset.py b/test/datasets/test_git_mol_dataset.py new file mode 100644 index 000000000000..3da72f2f2182 --- /dev/null +++ b/test/datasets/test_git_mol_dataset.py @@ -0,0 +1,19 @@ +import pytest + +from torch_geometric.datasets import GitMolDataset +from torch_geometric.testing import withPackage + + +@withPackage('torchvision', 'rdkit', 'PIL') +@pytest.mark.parametrize('split', [ + (0, 3610), + (1, 451), + (2, 451), +]) +def test_git_mol_dataset(split): + dataset = GitMolDataset(root='./data/GITMol', split=split[0]) + + assert len(dataset) == split[1] + assert dataset[0].image.size() == (1, 3, 224, 224) + assert dataset[0].num_node_features == 9 + assert dataset[0].num_edge_features == 3 diff --git a/test/nn/models/test_git_mol.py b/test/nn/models/test_git_mol.py new file mode 100644 index 000000000000..ee557bfaa9fc --- /dev/null +++ b/test/nn/models/test_git_mol.py @@ -0,0 +1,24 @@ +import torch + +from torch_geometric.nn.models import GITMol +from torch_geometric.testing import withPackage + + +@withPackage('transformers', 'sentencepiece', 'accelerate') +def test_git_mol(): + model = GITMol() + + x = torch.ones(10, 16, dtype=torch.long) + edge_index = torch.tensor([ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], + [1, 2, 3, 4, 0, 6, 7, 8, 9, 5], + ]) + edge_attr = torch.zeros(edge_index.size(1), 16, dtype=torch.long) + batch = torch.tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) + smiles = ['CC(C)([C@H]1CC2=C(O1)C=CC3=C2OC(=O)C=C3)O'] + captions = ['The molecule is the (R)-(-)-enantiomer of columbianetin.'] + images = torch.randn(1, 3, 224, 224) + + # Test train: + loss = model(x, edge_index, batch, edge_attr, smiles, images, captions) + assert loss >= 0 diff --git a/test/nn/nlp/test_vision_transformer.py b/test/nn/nlp/test_vision_transformer.py new file mode 100644 index 000000000000..7500ebc7fd0e --- /dev/null +++ b/test/nn/nlp/test_vision_transformer.py @@ -0,0 +1,26 @@ +import torch + +from torch_geometric.nn.nlp import VisionTransformer +from torch_geometric.testing import onlyFullTest, withCUDA, withPackage + + +@withCUDA +@onlyFullTest +@withPackage('transformers') +def test_vision_transformer(device): + model = VisionTransformer( + model_name='microsoft/swin-base-patch4-window7-224', ).to(device) + assert model.device == device + assert str( + model + ) == 'VisionTransformer(model_name=microsoft/swin-base-patch4-window7-224)' + + images = torch.randn(2, 3, 224, 224).to(device) + + out = model(images) + assert out.device == device + assert out.size() == (2, 49, 1024) + + out = model(images, output_device='cpu') + assert out.is_cpu + assert out.size() == (2, 49, 1024) diff --git a/torch_geometric/datasets/__init__.py b/torch_geometric/datasets/__init__.py index c086a85df779..12895ad1dbac 100644 --- a/torch_geometric/datasets/__init__.py +++ b/torch_geometric/datasets/__init__.py @@ -77,6 +77,7 @@ from .brca_tgca import BrcaTcga from .neurograph import NeuroGraphDataset from .web_qsp_dataset import WebQSPDataset +from .git_mol_dataset import GitMolDataset from .molecule_gpt_dataset import MoleculeGPTDataset from .tag_dataset import TAGDataset @@ -192,6 +193,7 @@ 'BrcaTcga', 'NeuroGraphDataset', 'WebQSPDataset', + 'GitMolDataset', 'MoleculeGPTDataset', 'TAGDataset', ] diff --git 
a/torch_geometric/datasets/git_mol_dataset.py b/torch_geometric/datasets/git_mol_dataset.py new file mode 100644 index 000000000000..4b7cfa78117c --- /dev/null +++ b/torch_geometric/datasets/git_mol_dataset.py @@ -0,0 +1,263 @@ +import sys +from typing import Any, Callable, Dict, List, Optional + +import numpy as np +import torch +from tqdm import tqdm + +from torch_geometric.data import ( + Data, + InMemoryDataset, + download_google_url, + extract_zip, +) +from torch_geometric.io import fs + + +def safe_index(lst: List[Any], e: int) -> int: + return lst.index(e) if e in lst else len(lst) - 1 + + +class GitMolDataset(InMemoryDataset): + r"""The dataset from the `"GIT-Mol: A Multi-modal Large Language Model + for Molecular Science with Graph, Image, and Text" + `_ paper. + + Args: + root (str): Root directory where the dataset should be saved. + transform (callable, optional): A function/transform that takes in an + :obj:`torch_geometric.data.Data` object and returns a transformed + version. The data object will be transformed before every access. + (default: :obj:`None`) + pre_transform (callable, optional): A function/transform that takes in + an :obj:`torch_geometric.data.Data` object and returns a + transformed version. The data object will be transformed before + being saved to disk. (default: :obj:`None`) + pre_filter (callable, optional): A function that takes in an + :obj:`torch_geometric.data.Data` object and returns a boolean + value, indicating whether the data object should be included in the + final dataset. (default: :obj:`None`) + force_reload (bool, optional): Whether to re-process the dataset. + (default: :obj:`False`) + split (int, optional): Datasets split, train/valid/test=0/1/2. + (default: :obj:`0`) + """ + + raw_url_id = '1loBXabD6ncAFY-vanRsVtRUSFkEtBweg' + + def __init__( + self, + root: str, + transform: Optional[Callable] = None, + pre_transform: Optional[Callable] = None, + pre_filter: Optional[Callable] = None, + force_reload: bool = False, + split: int = 0, + ): + from torchvision import transforms + + self.split = split + + if self.split == 0: + self.img_transform = transforms.Compose([ + transforms.Resize((224, 224)), + transforms.RandomRotation(15), + transforms.ColorJitter(brightness=0.5, contrast=0.5, hue=0.5), + transforms.ToTensor(), + transforms.Normalize(mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]) + ]) + else: + self.img_transform = transforms.Compose([ + transforms.Resize((224, 224)), + transforms.ToTensor(), + transforms.Normalize(mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]) + ]) + + super().__init__(root, transform, pre_transform, pre_filter, + force_reload=force_reload) + + self.load(self.processed_paths[0]) + + @property + def raw_file_names(self) -> List[str]: + return ['train_3500.pkl', 'valid_450.pkl', 'test_450.pkl'] + + @property + def processed_file_names(self) -> str: + return ['train.pt', 'valid.pt', 'test.pt'][self.split] + + def download(self) -> None: + file_path = download_google_url( + self.raw_url_id, + self.raw_dir, + 'gitmol.zip', + ) + extract_zip(file_path, self.raw_dir) + + def process(self) -> None: + import pandas as pd + from PIL import Image + + try: + from rdkit import Chem, RDLogger + RDLogger.DisableLog('rdApp.*') # type: ignore + WITH_RDKIT = True + + except ImportError: + WITH_RDKIT = False + + if not WITH_RDKIT: + print(("Using a pre-processed version of the dataset. 
Please " + "install 'rdkit' to alternatively process the raw data."), + file=sys.stderr) + + data_list = fs.torch_load(self.raw_paths[0]) + data_list = [Data(**data_dict) for data_dict in data_list] + + if self.pre_filter is not None: + data_list = [d for d in data_list if self.pre_filter(d)] + + if self.pre_transform is not None: + data_list = [self.pre_transform(d) for d in data_list] + + self.save(data_list, self.processed_paths[0]) + return + + allowable_features: Dict[str, List[Any]] = { + 'possible_atomic_num_list': + list(range(1, 119)) + ['misc'], + 'possible_formal_charge_list': + [-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 'misc'], + 'possible_chirality_list': [ + Chem.rdchem.ChiralType.CHI_UNSPECIFIED, + Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CW, + Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CCW, + Chem.rdchem.ChiralType.CHI_OTHER + ], + 'possible_hybridization_list': [ + Chem.rdchem.HybridizationType.SP, + Chem.rdchem.HybridizationType.SP2, + Chem.rdchem.HybridizationType.SP3, + Chem.rdchem.HybridizationType.SP3D, + Chem.rdchem.HybridizationType.SP3D2, + Chem.rdchem.HybridizationType.UNSPECIFIED, 'misc' + ], + 'possible_numH_list': [0, 1, 2, 3, 4, 5, 6, 7, 8, 'misc'], + 'possible_implicit_valence_list': [0, 1, 2, 3, 4, 5, 6], + 'possible_degree_list': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 'misc'], + 'possible_number_radical_e_list': [0, 1, 2, 3, 4, 'misc'], + 'possible_is_aromatic_list': [False, True], + 'possible_is_in_ring_list': [False, True], + 'possible_bond_type_list': [ + Chem.rdchem.BondType.SINGLE, Chem.rdchem.BondType.DOUBLE, + Chem.rdchem.BondType.TRIPLE, Chem.rdchem.BondType.AROMATIC, + Chem.rdchem.BondType.ZERO + ], + 'possible_bond_dirs': [ # only for double bond stereo information + Chem.rdchem.BondDir.NONE, Chem.rdchem.BondDir.ENDUPRIGHT, + Chem.rdchem.BondDir.ENDDOWNRIGHT + ], + 'possible_bond_stereo_list': [ + Chem.rdchem.BondStereo.STEREONONE, + Chem.rdchem.BondStereo.STEREOZ, + Chem.rdchem.BondStereo.STEREOE, + Chem.rdchem.BondStereo.STEREOCIS, + Chem.rdchem.BondStereo.STEREOTRANS, + Chem.rdchem.BondStereo.STEREOANY, + ], + 'possible_is_conjugated_list': [False, True] + } + + data = pd.read_pickle( + f'{self.raw_dir}/igcdata_toy/{self.raw_file_names[self.split]}') + + data_list = [] + for _, r in tqdm(data.iterrows(), total=data.shape[0]): + smiles = r['isosmiles'] + mol = Chem.MolFromSmiles(smiles.strip('\n')) + if mol is not None: + # text + summary = r['summary'] + # image + cid = r['cid'] + img_file = f'{self.raw_dir}/igcdata_toy/imgs/CID_{cid}.png' + img = Image.open(img_file).convert('RGB') + img = self.img_transform(img).unsqueeze(0) + # graph + atom_features_list = [] + for atom in mol.GetAtoms(): # type: ignore + atom_feature = [ + safe_index( + allowable_features['possible_atomic_num_list'], + atom.GetAtomicNum()), + allowable_features['possible_chirality_list'].index( + atom.GetChiralTag()), + safe_index(allowable_features['possible_degree_list'], + atom.GetTotalDegree()), + safe_index( + allowable_features['possible_formal_charge_list'], + atom.GetFormalCharge()), + safe_index(allowable_features['possible_numH_list'], + atom.GetTotalNumHs()), + safe_index( + allowable_features[ + 'possible_number_radical_e_list'], + atom.GetNumRadicalElectrons()), + safe_index( + allowable_features['possible_hybridization_list'], + atom.GetHybridization()), + allowable_features['possible_is_aromatic_list'].index( + atom.GetIsAromatic()), + allowable_features['possible_is_in_ring_list'].index( + atom.IsInRing()), + ] + atom_features_list.append(atom_feature) + x = 
torch.tensor(np.array(atom_features_list), + dtype=torch.long) + + edges_list = [] + edge_features_list = [] + for bond in mol.GetBonds(): # type: ignore + i, j = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx() + edge_feature = [ + safe_index( + allowable_features['possible_bond_type_list'], + bond.GetBondType()), + allowable_features['possible_bond_stereo_list'].index( + bond.GetStereo()), + allowable_features['possible_is_conjugated_list']. + index(bond.GetIsConjugated()), + ] + edges_list.append((i, j)) + edge_features_list.append(edge_feature) + edges_list.append((j, i)) + edge_features_list.append(edge_feature) + + edge_index = torch.tensor( + np.array(edges_list).T, + dtype=torch.long, + ) + edge_attr = torch.tensor( + np.array(edge_features_list), + dtype=torch.long, + ) + + data = Data( + x=x, + edge_index=edge_index, + smiles=smiles, + edge_attr=edge_attr, + image=img, + caption=summary, + ) + + if self.pre_filter is not None and not self.pre_filter(data): + continue + if self.pre_transform is not None: + data = self.pre_transform(data) + + data_list.append(data) + + self.save(data_list, self.processed_paths[0]) diff --git a/torch_geometric/nn/models/__init__.py b/torch_geometric/nn/models/__init__.py index 9aeac020264a..9ade58cebc05 100644 --- a/torch_geometric/nn/models/__init__.py +++ b/torch_geometric/nn/models/__init__.py @@ -29,6 +29,7 @@ from .neural_fingerprint import NeuralFingerprint from .visnet import ViSNet from .g_retriever import GRetriever +from .git_mol import GITMol from .molecule_gpt import MoleculeGPT from .glem import GLEM # Deprecated: @@ -78,6 +79,7 @@ 'NeuralFingerprint', 'ViSNet', 'GRetriever', + 'GITMol', 'MoleculeGPT', 'GLEM', ] diff --git a/torch_geometric/nn/models/git_mol.py b/torch_geometric/nn/models/git_mol.py new file mode 100644 index 000000000000..c06b44671931 --- /dev/null +++ b/torch_geometric/nn/models/git_mol.py @@ -0,0 +1,336 @@ +from typing import List, Optional + +import torch +import torch.nn.functional as F +from torch import Tensor +from torch.nn import BatchNorm1d, LayerNorm, Linear, ReLU, Sequential + +from torch_geometric.nn import GINEConv +from torch_geometric.nn.nlp import SentenceTransformer, VisionTransformer +from torch_geometric.utils import add_self_loops, to_dense_batch + + +class GraphEncoder(torch.nn.Module): + def __init__( + self, + num_layers: int, + in_channels: int, + dropout: float = 0., + num_atom_type: int = 120, + num_chirality_tag: int = 3, + num_bond_type: int = 6, + num_bond_direction: int = 3, + ) -> None: + super().__init__() + + self.num_layers = num_layers + self.dropout = dropout + + self.x_embed1 = torch.nn.Embedding(num_atom_type, in_channels) + self.x_embed2 = torch.nn.Embedding(num_chirality_tag, in_channels) + self.edge_embed1 = torch.nn.Embedding(num_bond_type, in_channels) + self.edge_embed2 = torch.nn.Embedding(num_bond_direction, in_channels) + + self.gnns = torch.nn.ModuleList() + self.batch_norms = torch.nn.ModuleList() + for _ in range(num_layers): + self.gnns.append( + GINEConv( + nn=Sequential( + Linear(in_channels, in_channels * 2), + ReLU(), + Linear(in_channels * 2, in_channels), + ), + train_eps=True, + edge_dim=in_channels, + )) + self.batch_norms.append(BatchNorm1d(in_channels)) + self.reset_parameters() + + def reset_parameters(self): + torch.nn.init.xavier_uniform_(self.x_embed1.weight.data) + torch.nn.init.xavier_uniform_(self.x_embed2.weight.data) + torch.nn.init.xavier_uniform_(self.edge_embed1.weight.data) + torch.nn.init.xavier_uniform_(self.edge_embed2.weight.data) + + def 
forward( + self, + x: Tensor, + edge_index: Tensor, + batch: Tensor, + edge_attr: Tensor, + ) -> Tensor: + x = self.x_embed1(x[:, 0].long()) + self.x_embed2(x[:, 1].long()) + edge_index, edge_attr = add_self_loops( + edge_index, + edge_attr, + fill_value=0, + num_nodes=x.size(0), + ) + edge_attr = self.edge_embed1(edge_attr[:, 0]) + self.edge_embed2( + edge_attr[:, 1]) + for i, (gnn, bn) in enumerate(zip(self.gnns, self.batch_norms)): + x = gnn(x, edge_index, edge_attr) + x = bn(x) + if i < self.num_layers - 1: + x = F.relu(x) + x = F.dropout(x, self.dropout, training=self.training) + + x, mask = to_dense_batch(x, batch) + return x, mask + + +class GITFormer(torch.nn.Module): + def __init__( + self, + num_query_token: int, + vision_graph_width: int, + cross_attention_freq: int = 2, + ): + super().__init__() + from transformers import AutoConfig, AutoModel + + config = AutoConfig.from_pretrained("allenai/scibert_scivocab_uncased") + config.encoder_width = vision_graph_width + # insert cross-attention layer every other block + config.add_cross_attention = True + config.is_decoder = True + config.cross_attention_freq = cross_attention_freq + config.query_length = num_query_token + self.Qformer = AutoModel.from_pretrained( + "allenai/scibert_scivocab_uncased", config=config) + self.query_tokens = torch.nn.Parameter( + torch.zeros(1, num_query_token, config.hidden_size)) + self.query_tokens.data.normal_(mean=0.0, std=config.initializer_range) + + +class GITMol(torch.nn.Module): + r"""The GITMol model from the `"GIT-Mol: A Multi-modal Large Language + Model for Molecular Science with Graph, Image, and Text" + `_ paper. + + .. note:: + For an example of using :class:`GITMol`, see + `examples/llm/git_mol.py `_. + """ + def __init__(self) -> None: + super().__init__() + # graph + self.graph_encoder = GraphEncoder(num_layers=2, in_channels=16) + self.graph_proj = Linear(16, 768) + self.ln_graph = LayerNorm(768) + # text + self.text_encoder = SentenceTransformer( + model_name='allenai/scibert_scivocab_uncased', + pooling_strategy='last_hidden_state', + ) + self.text_proj = Linear(768, 768) + self.ln_text = LayerNorm(768) + # vision + self.vision_encoder = VisionTransformer( + model_name='microsoft/swin-base-patch4-window7-224', ) + self.vision_proj = Linear(1024, 768) + self.ln_vision = LayerNorm(768) + # cross-attention + self.gitformer = GITFormer(384, 768) + + self.xtm_head = torch.nn.ModuleDict({ + 'image': + Linear(self.gitformer.Qformer.config.hidden_size, 2), + 'graph': + Linear(self.gitformer.Qformer.config.hidden_size, 2), + 'cs_text': + Linear(self.gitformer.Qformer.config.hidden_size, 2), + }) + + self.xtc_proj = torch.nn.ModuleDict({ + 'image': + Linear(self.gitformer.Qformer.config.hidden_size, 768), + 'graph': + Linear(self.gitformer.Qformer.config.hidden_size, 768), + 'cs_text': + Linear(self.gitformer.Qformer.config.hidden_size, 768), + }) + self.temp = torch.nn.Parameter(0.07 * torch.ones([])) + self.model_freeze() + + def model_freeze(self) -> None: + for param in self.graph_encoder.parameters(): + param.requires_grad = False + + for param in self.vision_encoder.parameters(): + param.requires_grad = False + + def forward( + self, + x: Tensor, + edge_index: Tensor, + batch: Tensor, + edge_attr: Optional[Tensor], + smiles: List[str], + images: Tensor, + captions: List[str], + ) -> Tensor: + batch_size = len(smiles) + + x_vision = self.vision_encoder(images) + x_vision = self.vision_proj(x_vision) + x_vision = self.ln_vision(x_vision) # [bs, patch_len, d] + vision_atts = 
torch.ones(x_vision.size()[:-1], + dtype=torch.long).to(x_vision.device) + vision_targets = torch.arange(batch_size).to(x_vision.device) + + x_graph, graph_atts = self.graph_encoder(x, edge_index, batch, + edge_attr) + x_graph = self.graph_proj(x_graph) + x_graph = self.ln_graph(x_graph) # [bs, node_len, d] + graph_targets = torch.arange(batch_size).to(x_graph.device) + + x_smiles = self.text_encoder.encode(smiles) # [bs, seq_len, d] + smiles_atts = torch.ones(x_smiles.size()[:-1], + dtype=torch.long).to(x_smiles.device) + smiles_targets = torch.arange(batch_size).to(x_smiles.device) + + caption_input_ids, caption_attention_masks = self.text_encoder.get_input_ids( # noqa: E501 + captions) + + text_output = self.gitformer.Qformer( + caption_input_ids, + attention_mask=caption_attention_masks, + return_dict=True, + ) + text_feat = F.normalize( + self.text_proj(text_output.last_hidden_state[:, 0, :]), dim=-1) + + loss = 0 + for x_embed, x_atts, x_targets, modal in zip( + [x_graph, x_smiles, x_vision], + [graph_atts, smiles_atts, vision_atts], + [graph_targets, smiles_targets, vision_targets], + ['graph', 'cs_text', 'image'], + ): + loss += self._calc_xtc_loss(x_embed, x_atts, x_targets, text_feat, + modal) + loss += self._calc_xtm_loss(x_embed, caption_input_ids, + caption_attention_masks, modal) + + return loss / 6 + + def _calc_xtm_loss( + self, + x_embeds: Tensor, + input_ids: Tensor, + attention_mask: Tensor, + modal: str, + ) -> Tensor: + # Initializing lists to hold the original and negative samples + x_embeds_list = [] + text_input_ids_list = [] + text_attention_mask_list = [] + + batch_size = x_embeds.size(0) + for i in range(batch_size): + # Original samples + x_embeds_list.append(x_embeds[i]) + text_input_ids_list.append(input_ids[i, :]) + text_attention_mask_list.append(attention_mask[i, :]) + + if batch_size > 1: + # Negative samples (neg_text_input_ids corresponds to x_embeds) + neg_text_input_ids = input_ids[i - 1 if i == batch_size - + 1 else i + 1, :] + neg_text_attention_mask = attention_mask[i - + 1 if i == batch_size - + 1 else i + 1, :] + text_input_ids_list.append(neg_text_input_ids) + text_attention_mask_list.append(neg_text_attention_mask) + x_embeds_list.append(x_embeds[i, :]) + + # Negative samples (text_input_ids corresponds to neg_x_embeds) + neg_x_embeds = x_embeds[i - 1 if i == batch_size - 1 else i + + 1, :] + x_embeds_list.append(neg_x_embeds) + text_input_ids_list.append(input_ids[i, :]) + text_attention_mask_list.append(attention_mask[i, :]) + + # Stack all samples into two large tensors + x_embeds_all = torch.stack(x_embeds_list, dim=1) \ + .reshape(-1, x_embeds.size(1), x_embeds.size(2)) + text_input_ids_all = torch.stack(text_input_ids_list, dim=1) \ + .reshape(-1, input_ids.size(1)) + # Create image attention masks for the concatenated tensor + image_attns_all = torch.ones(x_embeds_all.size()[:-1], + dtype=torch.long).to(x_embeds_all.device) + query_tokens_xtm = self.gitformer.query_tokens.expand( + text_input_ids_all.shape[0], -1, -1) + query_attns_xtm = torch.ones(query_tokens_xtm.size()[:-1], + dtype=torch.long).to(x_embeds_all.device) + + output_xtm = self.gitformer.Qformer( + inputs_embeds=query_tokens_xtm, + attention_mask=query_attns_xtm, + encoder_hidden_states=x_embeds_all, + encoder_attention_mask=image_attns_all, + return_dict=True, + ).last_hidden_state + + xtm_embeddings = output_xtm[:, :query_tokens_xtm.size(1), :] + + xtm_logit = self.xtm_head[modal](xtm_embeddings).mean(dim=1) + # Create labels: 1 for the original samples, 0 for the 
negative samples + if batch_size > 1: + labels = torch.cat( + [torch.ones(batch_size), + torch.zeros(batch_size * 2)], dim=0) + else: + labels = torch.ones(batch_size) + labels = labels.long().to(xtm_logit.device) + + # Calculate cross entropy loss + return F.cross_entropy(xtm_logit, labels) + + def _calc_xtc_loss( + self, + x_embeds: Tensor, + x_atts: Tensor, + x_targets: Tensor, + text_feat: Tensor, + modal: str, + ) -> Tensor: + query_tokens = self.gitformer.query_tokens.expand( + x_embeds.shape[0], -1, -1) + + query_output = self.gitformer.Qformer( + inputs_embeds=query_tokens, + encoder_hidden_states=x_embeds, + encoder_attention_mask=x_atts, + return_dict=True, + ).last_hidden_state + + x_feats = F.normalize(self.xtc_proj[modal](query_output), dim=-1) + + sim_q2t = torch.matmul( + x_feats.unsqueeze(1), + text_feat.unsqueeze(-1), + ).squeeze(-1) + + # modal-text similarity: aggregate across all query tokens + sim_x2t, _ = sim_q2t.max(-1) + sim_x2t = sim_x2t / self.temp + + # text-query similarity + sim_t2q = torch.matmul( + text_feat.unsqueeze(1).unsqueeze(1), + x_feats.permute(0, 2, 1), + ).squeeze(-2) + + # text-modal similarity: aggregate across all query tokens + sim_t2x, _ = sim_t2q.max(-1) + sim_t2x = sim_t2x / self.temp + + loss_itc = ( + F.cross_entropy(sim_x2t, x_targets, label_smoothing=0.1) + + F.cross_entropy(sim_t2x, x_targets, label_smoothing=0.1)) / 2 + + return loss_itc diff --git a/torch_geometric/nn/nlp/__init__.py b/torch_geometric/nn/nlp/__init__.py index c101a359e3f5..434619352460 100644 --- a/torch_geometric/nn/nlp/__init__.py +++ b/torch_geometric/nn/nlp/__init__.py @@ -1,7 +1,9 @@ from .sentence_transformer import SentenceTransformer +from .vision_transformer import VisionTransformer from .llm import LLM __all__ = classes = [ 'SentenceTransformer', + 'VisionTransformer', 'LLM', ] diff --git a/torch_geometric/nn/nlp/sentence_transformer.py b/torch_geometric/nn/nlp/sentence_transformer.py index 715f343bfc19..6d904b8e0fbf 100644 --- a/torch_geometric/nn/nlp/sentence_transformer.py +++ b/torch_geometric/nn/nlp/sentence_transformer.py @@ -48,6 +48,36 @@ def forward(self, input_ids: Tensor, attention_mask: Tensor) -> Tensor: emb = F.normalize(emb, p=2, dim=1) return emb + def get_input_ids( + self, + text: List[str], + batch_size: Optional[int] = None, + output_device: Optional[Union[torch.device, str]] = None, + ) -> Tensor: + is_empty = len(text) == 0 + text = ['dummy'] if is_empty else text + + batch_size = len(text) if batch_size is None else batch_size + + input_ids: List[Tensor] = [] + attention_masks: List[Tensor] = [] + for start in range(0, len(text), batch_size): + token = self.tokenizer( + text[start:start + batch_size], + padding=True, + truncation=True, + return_tensors='pt', + ) + input_ids.append(token.input_ids.to(self.device)) + attention_masks.append(token.attention_mask.to(self.device)) + + def _out(x: List[Tensor]) -> Tensor: + out = torch.cat(x, dim=0) if len(x) > 1 else x[0] + out = out[:0] if is_empty else out + return out.to(output_device) + + return _out(input_ids), _out(attention_masks) + @property def device(self) -> torch.device: return next(iter(self.model.parameters())).device diff --git a/torch_geometric/nn/nlp/vision_transformer.py b/torch_geometric/nn/nlp/vision_transformer.py new file mode 100644 index 000000000000..517a524f4d84 --- /dev/null +++ b/torch_geometric/nn/nlp/vision_transformer.py @@ -0,0 +1,33 @@ +from typing import Optional, Union + +import torch +from torch import Tensor + + +class VisionTransformer(torch.nn.Module): + 
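# NOTE: `SwinModel(self.config)` in `__init__` below builds the Swin
+    # architecture from its configuration only, so the backbone weights start
+    # randomly initialized; `SwinModel.from_pretrained(model_name)` would
+    # load a pretrained checkpoint instead.
+    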
def __init__( + self, + model_name: str, + ) -> None: + super().__init__() + self.model_name = model_name + + from transformers import SwinConfig, SwinModel + + self.config = SwinConfig.from_pretrained(model_name) + self.model = SwinModel(self.config) + + @torch.no_grad() + def forward( + self, + images: Tensor, + output_device: Optional[Union[torch.device, str]] = None, + ) -> Tensor: + return self.model(images).last_hidden_state.to(output_device) + + @property + def device(self) -> torch.device: + return next(iter(self.model.parameters())).device + + def __repr__(self) -> str: + return f'{self.__class__.__name__}(model_name={self.model_name})' From f8760ec39951715def19a95ee62ae4fc3ed056fd Mon Sep 17 00:00:00 2001 From: Matthias Fey Date: Mon, 25 Nov 2024 11:24:00 +0100 Subject: [PATCH 06/10] Run `GitMolDataset` tests only in full test mode (#9804) --- test/datasets/test_git_mol_dataset.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/test/datasets/test_git_mol_dataset.py b/test/datasets/test_git_mol_dataset.py index 3da72f2f2182..f4e652b6ae43 100644 --- a/test/datasets/test_git_mol_dataset.py +++ b/test/datasets/test_git_mol_dataset.py @@ -1,16 +1,19 @@ +from typing import Tuple + import pytest from torch_geometric.datasets import GitMolDataset -from torch_geometric.testing import withPackage +from torch_geometric.testing import onlyFullTest, withPackage +@onlyFullTest @withPackage('torchvision', 'rdkit', 'PIL') @pytest.mark.parametrize('split', [ (0, 3610), (1, 451), (2, 451), ]) -def test_git_mol_dataset(split): +def test_git_mol_dataset(split: Tuple[int, int]) -> None: dataset = GitMolDataset(root='./data/GITMol', split=split[0]) assert len(dataset) == split[1] From bd7876c86046a4ed8e17583bc0bbaebeb0518510 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Mon, 25 Nov 2024 02:24:23 -0800 Subject: [PATCH 07/10] fix for cugraph (#9803) --- examples/multi_gpu/papers100m_gcn_cugraph.py | 4 ++-- examples/multi_gpu/papers100m_gcn_cugraph_multinode.py | 4 ++-- examples/ogbn_papers_100m_cugraph.py | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/multi_gpu/papers100m_gcn_cugraph.py b/examples/multi_gpu/papers100m_gcn_cugraph.py index 5413492a5bc5..799b6317c374 100644 --- a/examples/multi_gpu/papers100m_gcn_cugraph.py +++ b/examples/multi_gpu/papers100m_gcn_cugraph.py @@ -86,8 +86,8 @@ def run(rank, data, world_size, cugraph_id, model, epochs, batch_size, fan_out, )] = ixr feature_store = TensorDictFeatureStore() - feature_store['node', 'x'] = data.x - feature_store['node', 'y'] = data.y + feature_store['node', 'x', None] = data.x + feature_store['node', 'y', None] = data.y dist.barrier() diff --git a/examples/multi_gpu/papers100m_gcn_cugraph_multinode.py b/examples/multi_gpu/papers100m_gcn_cugraph_multinode.py index 4ea78eb64ad6..eb074defeafe 100644 --- a/examples/multi_gpu/papers100m_gcn_cugraph_multinode.py +++ b/examples/multi_gpu/papers100m_gcn_cugraph_multinode.py @@ -142,9 +142,9 @@ def load_partitioned_data(rank, edge_path, feature_path, label_path, meta_path, split_idx[split] = fs.torch_load(path) path = osp.join(feature_path, f'rank={rank}_x.pt') - feature_store['node', 'x'] = fs.torch_load(path) + feature_store['node', 'x', None] = fs.torch_load(path) path = osp.join(feature_path, f'rank={rank}_y.pt') - feature_store['node', 'y'] = fs.torch_load(path) + feature_store['node', 'y', None] = fs.torch_load(path) eix = fs.torch_load(osp.join(edge_path, f'rank={rank}.pt')) graph_store[dict( diff --git 
a/examples/ogbn_papers_100m_cugraph.py b/examples/ogbn_papers_100m_cugraph.py
index 7c1da866056a..8ae35cd776a4 100644
--- a/examples/ogbn_papers_100m_cugraph.py
+++ b/examples/ogbn_papers_100m_cugraph.py
@@ -63,8 +63,8 @@
 )] = data.edge_index

 feature_store = cugraph_pyg.data.TensorDictFeatureStore()
-feature_store['node', 'x'] = data.x
-feature_store['node', 'y'] = data.y
+feature_store['node', 'x', None] = data.x
+feature_store['node', 'y', None] = data.y

 data = (feature_store, graph_store)

From 742f790090e420b8a3ebff295e5280a373d8aed1 Mon Sep 17 00:00:00 2001
From: zaristei
Date: Mon, 25 Nov 2024 17:29:25 -0800
Subject: [PATCH 08/10] G-retriever API updates (NVTX, Remote Backend, Large Graph Indexer, Examples) (#9666)

Follow-up to [PR 9597](https://github.com/pyg-team/pytorch_geometric/pull/9597).
Includes multiple changes related to LLM+GNN experiments and scaling up to a
remote backend, including:

- LargeGraphIndexer for building a large knowledge graph locally from multiple
  samples in an arbitrary dataset
- Remote Backend Loader and examples for deploying a retrieval algorithm to a
  third-party backend FeatureStore or GraphStore
- NVTX profiling tools for nsys users
- Quality-of-life improvements and benchmarking scripts for G-Retriever.
  Updates using these for WebQSP will be moved to a separate PR

UPDATE: PR is being broken up into smaller PRs. These can be previewed here:

- https://github.com/zaristei/pytorch_geometric/pull/6
- https://github.com/zaristei/pytorch_geometric/pull/7
- https://github.com/zaristei/pytorch_geometric/pull/8

---------

Co-authored-by: Zack Aristei
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Zachary Aristei
Co-authored-by: Rishi Puri
---
 CHANGELOG.md                                  |   4 +
 examples/llm/README.md                        |  15 +-
 examples/llm/g_retriever.py                   |   4 +
 examples/llm/g_retriever_utils/README.md      |  11 +
 .../benchmark_model_archs_rag.py              | 105 +++
 .../llm/g_retriever_utils/minimal_demo.py     | 638 +++++++++++++++++
 .../g_retriever_utils/rag_backend_utils.py    | 224 ++++++
 .../g_retriever_utils/rag_feature_store.py    | 189 +++++
 .../llm/g_retriever_utils/rag_generate.py     | 139 ++++
 .../llm/g_retriever_utils/rag_graph_store.py  | 107 +++
 examples/llm/multihop_rag/README.md           |   9 +
 .../llm/multihop_rag/multihop_download.sh     |  12 +
 .../llm/multihop_rag/multihop_preprocess.py   | 276 +++++++
 .../llm/multihop_rag/rag_generate_multihop.py |  88 +++
 examples/llm/nvtx_examples/README.md          |   7 +
 .../nvtx_examples/nvtx_rag_backend_example.py | 144 ++++
 examples/llm/nvtx_examples/nvtx_run.sh        |  27 +
 .../llm/nvtx_examples/nvtx_webqsp_example.py  |  22 +
 test/data/test_large_graph_indexer.py         | 177 +++++
 test/nn/models/test_g_retriever.py            |  49 ++
 test/profile/test_nvtx.py                     | 136 ++++
 torch_geometric/data/__init__.py              |   5 +
 torch_geometric/data/large_graph_indexer.py   | 677 ++++++++++++++++++
 torch_geometric/loader/__init__.py            |   2 +
 torch_geometric/loader/rag_loader.py          | 106 +++
 torch_geometric/nn/models/g_retriever.py      |  13 +-
 torch_geometric/profile/__init__.py           |   2 +
 torch_geometric/profile/nvtx.py               |  66 ++
 28 files changed, 3247 insertions(+), 7 deletions(-)
 create mode 100644 examples/llm/g_retriever_utils/README.md
 create mode 100644 examples/llm/g_retriever_utils/benchmark_model_archs_rag.py
 create mode 100644 examples/llm/g_retriever_utils/minimal_demo.py
 create mode 100644 examples/llm/g_retriever_utils/rag_backend_utils.py
 create mode 100644 examples/llm/g_retriever_utils/rag_feature_store.py
 create mode 100644 examples/llm/g_retriever_utils/rag_generate.py
 create mode
100644 examples/llm/g_retriever_utils/rag_graph_store.py create mode 100644 examples/llm/multihop_rag/README.md create mode 100644 examples/llm/multihop_rag/multihop_download.sh create mode 100644 examples/llm/multihop_rag/multihop_preprocess.py create mode 100644 examples/llm/multihop_rag/rag_generate_multihop.py create mode 100644 examples/llm/nvtx_examples/README.md create mode 100644 examples/llm/nvtx_examples/nvtx_rag_backend_example.py create mode 100644 examples/llm/nvtx_examples/nvtx_run.sh create mode 100644 examples/llm/nvtx_examples/nvtx_webqsp_example.py create mode 100644 test/data/test_large_graph_indexer.py create mode 100644 test/profile/test_nvtx.py create mode 100644 torch_geometric/data/large_graph_indexer.py create mode 100644 torch_geometric/loader/rag_loader.py create mode 100644 torch_geometric/profile/nvtx.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 86e06d8835e8..341be665fabf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Added +- Added various GRetriever Architecture Benchmarking examples ([#9666](https://github.com/pyg-team/pytorch_geometric/pull/9666)) +- Added `profiler.nvtxit` with some examples ([#9666](https://github.com/pyg-team/pytorch_geometric/pull/9666)) +- Added `loader.RagQueryLoader` with Remote Backend Example ([#9666](https://github.com/pyg-team/pytorch_geometric/pull/9666)) +- Added `data.LargeGraphIndexer` ([#9666](https://github.com/pyg-team/pytorch_geometric/pull/9666)) - Added `GIT-Mol` ([#9730](https://github.com/pyg-team/pytorch_geometric/pull/9730)) - Added comment in `g_retriever.py` pointing to `Neo4j` Graph DB integration demo ([#9748](https://github.com/pyg-team/pytorch_geometric/pull/9797)) - Added `MoleculeGPT` example ([#9710](https://github.com/pyg-team/pytorch_geometric/pull/9710)) diff --git a/examples/llm/README.md b/examples/llm/README.md index eb471563de8e..4503e28ce6ee 100644 --- a/examples/llm/README.md +++ b/examples/llm/README.md @@ -1,8 +1,11 @@ # Examples for Co-training LLMs and GNNs -| Example | Description | -| -------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| [`g_retriever.py`](./g_retriever.py) | Example for Retrieval-Augmented Generation (RAG) w/ GNN+LLM by co-training `LLAMA2` with `GAT` for answering questions based on knowledge graph information | -| [`git_mol.py`](./git_mol.py) | Example for GIT-Mol: A Multi-modal Large Language Model for Molecular Science with Graph, Image, and Text | -| [`molecule_gpt.py`](./molecule_gpt.py) | Example for MoleculeGPT: Instruction Following Large Language Models for Molecular Property Prediction | -| [`glem.py`](./glem.py) | Example for [GLEM](https://arxiv.org/abs/2210.14709), a GNN+LLM co-training model via variational Expectation-Maximization (EM) framework on node classification tasks to achieve SOTA results | +| Example | Description | +| -------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [`g_retriever.py`](./g_retriever.py) | Example for Retrieval-Augmented Generation (RAG) w/ GNN+LLM by co-training `LLAMA2` with `GAT` for answering questions based on knowledge graph information | +| 
[`g_retriever_utils/`](./g_retriever_utils/) | Contains multiple scripts for benchmarking GRetriever's architecture and evaluating different retrieval methods. |
| [`multihop_rag/`](./multihop_rag/)           | Contains starter code and an example run for building a Multi-hop dataset using WikiHop5M and 2WikiMultiHopQA                                                                                     |
| [`nvtx_examples/`](./nvtx_examples/)         | Contains examples of how to wrap functions using the NVTX profiler for CUDA runtime analysis.                                                                                                     |
| [`molecule_gpt.py`](./molecule_gpt.py)       | Example for MoleculeGPT: Instruction Following Large Language Models for Molecular Property Prediction                                                                                            |
| [`glem.py`](./glem.py)                       | Example for [GLEM](https://arxiv.org/abs/2210.14709), a GNN+LLM co-training model via a variational Expectation-Maximization (EM) framework on node classification tasks to achieve SOTA results  |
| [`git_mol.py`](./git_mol.py)                 | Example for GIT-Mol: A Multi-modal Large Language Model for Molecular Science with Graph, Image, and Text                                                                                         |
diff --git a/examples/llm/g_retriever.py b/examples/llm/g_retriever.py
index 984ce3f010e7..a48901f1ff0e 100644
--- a/examples/llm/g_retriever.py
+++ b/examples/llm/g_retriever.py
@@ -11,6 +11,7 @@
 https://github.com/neo4j-product-examples/neo4j-gnn-llm-example
 """
 import argparse
+import gc
 import math
 import os.path as osp
 import re
@@ -145,6 +146,9 @@ def adjust_learning_rate(param_group, LR, epoch):
     test_loader = DataLoader(test_dataset, batch_size=eval_batch_size,
                              drop_last=False, pin_memory=True, shuffle=False)
+    # To clean up after data preprocessing:
+    gc.collect()
+    torch.cuda.empty_cache()
     gnn = GAT(
         in_channels=1024,
         hidden_channels=hidden_channels,
diff --git a/examples/llm/g_retriever_utils/README.md b/examples/llm/g_retriever_utils/README.md
new file mode 100644
index 000000000000..e072e6746b7c
--- /dev/null
+++ b/examples/llm/g_retriever_utils/README.md
@@ -0,0 +1,11 @@
+# Examples for LLM and GNN co-training
+
+| Example                                                          | Description                                                                                                                                                                |
+| ---------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| [`rag_feature_store.py`](./rag_feature_store.py)                 | A proof-of-concept implementation of a RAG-enabled FeatureStore that can serve as a starting point for implementing a custom RAG Remote Backend                           |
+| [`rag_graph_store.py`](./rag_graph_store.py)                     | A proof-of-concept implementation of a RAG-enabled GraphStore that can serve as a starting point for implementing a custom RAG Remote Backend                             |
+| [`rag_backend_utils.py`](./rag_backend_utils.py)                 | Utility functions used for loading a series of Knowledge Graph Triplets into the Remote Backend defined by a FeatureStore and GraphStore                                  |
+| [`rag_generate.py`](./rag_generate.py)                           | Script for generating a unique set of subgraphs from the WebQSP dataset using a custom-defined retrieval algorithm (defaults to the FeatureStore and GraphStore provided) |
+| [`benchmark_model_archs_rag.py`](./benchmark_model_archs_rag.py) | Script for running a GNN/LLM benchmark on GRetriever while grid searching relevant architecture parameters and datasets.                                                  |
+
+NOTE: Evaluating performance on GRetriever with smaller sample sizes may result in subpar performance. It is not unusual for the fine-tuned model/LLM to perform worse than an untrained LLM on very small sample sizes.
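+
+A minimal end-to-end sketch of how these utilities compose (mirroring `rag_generate.py`) is shown below. The triplets and the query are illustrative placeholders, and the retrieval parameters (`k_nodes`, `k_edges`, neighbor fanout) are untuned defaults rather than recommended settings:
+
+```python
+import torch
+from rag_backend_utils import create_remote_backend_from_triplets
+from rag_feature_store import SentenceTransformerFeatureStore
+from rag_graph_store import NeighborSamplingRAGGraphStore
+
+from torch_geometric.datasets.web_qsp_dataset import preprocess_triplet
+from torch_geometric.loader import RAGQueryLoader
+from torch_geometric.nn.nlp import SentenceTransformer
+
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+model = SentenceTransformer(
+    model_name='sentence-transformers/all-roberta-large-v1').to(device)
+
+# `triplets` can be any iterable of (head, relation, tail) string triples:
+triplets = [('Alexander Graham Bell', 'inventor of', 'telephone'),
+            ('Alexander Graham Bell', 'born in', 'Edinburgh')]
+
+# Embed all nodes and edges once and persist them behind a
+# FeatureStore/GraphStore pair:
+fs, gs = create_remote_backend_from_triplets(
+    triplets=triplets, node_embedding_model=model,
+    node_method_to_call='encode', path='backend',
+    pre_transform=preprocess_triplet,
+    node_method_kwargs={'batch_size': 256},
+    graph_db=NeighborSamplingRAGGraphStore,
+    feature_db=SentenceTransformerFeatureStore).load()
+
+# Retrieve a candidate subgraph per natural-language query; pass
+# `local_filter=...` to post-process each subgraph (e.g., with PCST):
+query_loader = RAGQueryLoader(data=(fs, gs), seed_nodes_kwargs={'k_nodes': 5},
+                              seed_edges_kwargs={'k_edges': 5},
+                              sampler_kwargs={'num_neighbors': [50] * 2})
+subgraph = query_loader.query('where was the inventor of the telephone born?')
+```
+
+The returned subgraph is a `Data` object carrying `node_idx`/`edge_idx` back-references into the indexed graph, which the retrieval metrics in `rag_generate.py` rely on.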
diff --git a/examples/llm/g_retriever_utils/benchmark_model_archs_rag.py b/examples/llm/g_retriever_utils/benchmark_model_archs_rag.py
new file mode 100644
index 000000000000..6522aafca68b
--- /dev/null
+++ b/examples/llm/g_retriever_utils/benchmark_model_archs_rag.py
@@ -0,0 +1,105 @@
+"""Used to benchmark the performance of an untuned/fine-tuned LLM against
+GRetriever with various architectures and layer depths.
+"""
+# %%
+import argparse
+import sys
+
+import torch
+
+from torch_geometric.datasets import WebQSPDataset
+from torch_geometric.nn.models import GAT, MLP, GRetriever
+
+sys.path.append('..')
+from minimal_demo import (  # noqa: E402 # isort:skip
+    benchmark_models, get_loss, inference_step,
+)
+
+# %%
+parser = argparse.ArgumentParser(
+    description="""Benchmarker for GRetriever\n""" +
+    """NOTE: Evaluating with smaller samples may result in poorer""" +
+    """ performance for the trained models compared to """ +
+    """untrained models.""")
+parser.add_argument("--hidden_channels", type=int, default=1024)
+parser.add_argument("--learning_rate", type=float, default=1e-5)
+parser.add_argument("--epochs", type=int, default=2)
+parser.add_argument("--batch_size", type=int, default=8)
+parser.add_argument("--eval_batch_size", type=int, default=16)
+parser.add_argument("--tiny_llama", action='store_true')
+
+parser.add_argument("--dataset_path", type=str, required=False)
+# Default to WebQSP split
+parser.add_argument("--num_train", type=int, default=2826)
+parser.add_argument("--num_val", type=int, default=246)
+parser.add_argument("--num_test", type=int, default=1628)
+
+args = parser.parse_args()
+
+# %%
+hidden_channels = args.hidden_channels
+lr = args.learning_rate
+epochs = args.epochs
+batch_size = args.batch_size
+eval_batch_size = args.eval_batch_size
+
+# %%
+if not args.dataset_path:
+    ds = WebQSPDataset('benchmark_archs', verbose=True, force_reload=True)
+else:
+    # We just assume that the size of the dataset accommodates the
+    # train/val/test split, because checking may be expensive.
+    dataset = torch.load(args.dataset_path)
+
+    class MockDataset:
+        """Utility class to patch the fields in WebQSPDataset used by
+        GRetriever.
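+        Only the `split_idxs` property and `__getitem__` are exercised by
+        the benchmark, so no other dataset fields are mocked.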
+ """ + def __init__(self) -> None: + pass + + @property + def split_idxs(self) -> dict: + # Imitates the WebQSP split method + return { + "train": + torch.arange(args.num_train), + "val": + torch.arange(args.num_val) + args.num_train, + "test": + torch.arange(args.num_test) + args.num_train + args.num_val, + } + + def __getitem__(self, idx: int): + return dataset[idx] + + ds = MockDataset() + +# %% +model_names = [] +model_classes = [] +model_kwargs = [] +model_type = ["GAT", "MLP"] +models = {"GAT": GAT, "MLP": MLP} +# Use to vary the depth of the GNN model +num_layers = [4] +# Use to vary the number of LLM tokens reserved for GNN output +num_tokens = [1] +for m_type in model_type: + for n_layer in num_layers: + for n_tokens in num_tokens: + model_names.append(f"{m_type}_{n_layer}_{n_tokens}") + model_classes.append(GRetriever) + kwargs = dict(gnn_hidden_channels=hidden_channels, + num_gnn_layers=n_layer, gnn_to_use=models[m_type], + mlp_out_tokens=n_tokens) + if args.tiny_llama: + kwargs['llm_to_use'] = 'TinyLlama/TinyLlama-1.1B-Chat-v0.1' + kwargs['mlp_out_dim'] = 2048 + kwargs['num_llm_params'] = 1 + model_kwargs.append(kwargs) + +# %% +benchmark_models(model_classes, model_names, model_kwargs, ds, lr, epochs, + batch_size, eval_batch_size, get_loss, inference_step, + skip_LLMs=False, tiny_llama=args.tiny_llama, force=True) diff --git a/examples/llm/g_retriever_utils/minimal_demo.py b/examples/llm/g_retriever_utils/minimal_demo.py new file mode 100644 index 000000000000..bdd78c3180cb --- /dev/null +++ b/examples/llm/g_retriever_utils/minimal_demo.py @@ -0,0 +1,638 @@ +"""This example implements the G-Retriever model +(https://arxiv.org/abs/2402.07630) using PyG. + +G-Retriever significantly reduces hallucinations by 54% compared to the +stand-alone LLM baseline. + +Requirements: +`pip install datasets transformers pcst_fast sentencepiece accelerate` +""" +import argparse +import gc +import math +import multiprocessing as mp +import re +import sys +import time +from os import path +from typing import Any, Callable, Dict, List, Type + +import pandas as pd +import torch +import torch.nn as nn +from torch import Tensor +from torch.nn.utils import clip_grad_norm_ +from tqdm import tqdm + +from torch_geometric import seed_everything +from torch_geometric.data import Dataset +from torch_geometric.datasets import WebQSPDataset +from torch_geometric.loader import DataLoader +from torch_geometric.nn.models import GAT, GRetriever +from torch_geometric.nn.nlp import LLM + +# NOTE: This used to be merged in the G-Retriever example. +# FIXME: Getting the demos working like before is a WIP +sys.path.append('..') +from g_retriever import ( # noqa: E402 # isort:skip + compute_metrics, load_params_dict, save_params_dict, +) + + +def _detect_hallucinate(inp): + pred, label = inp + try: + split_pred = pred.split('[/s]')[0].strip().split('|') + correct_hit = len(re.findall(split_pred[0], label)) > 0 + correct_hit = correct_hit or any( + [label_i in pred.lower() for label_i in label.split('|')]) + hallucination = not correct_hit + return hallucination + except: # noqa + return "skip" + + +def detect_hallucinate(pred_batch, label_batch): + r"""An approximation for the unsolved task of detecting hallucinations. + We define a hallucination as an output that contains no instances of + acceptable label. 
+ """ + with mp.Pool(len(pred_batch)) as p: + res = p.map(_detect_hallucinate, zip(pred_batch, label_batch)) + return res + + +def compute_n_parameters(model: torch.nn.Module) -> int: + return sum([p.numel() for p in model.parameters() if p.requires_grad]) + + +def get_loss(model, batch, model_save_name) -> Tensor: + if model_save_name == 'llm': + return model(batch.question, batch.label, batch.desc) + else: + return model(batch.question, batch.x, batch.edge_index, batch.batch, + batch.label, batch.edge_attr, batch.desc) + + +def inference_step(model, batch, model_save_name): + if model_save_name == 'llm': + return model.inference(batch.question, batch.desc) + else: + return model.inference(batch.question, batch.x, batch.edge_index, + batch.batch, batch.edge_attr, batch.desc) + + +# TODO: Merge with G-Retriever example and make sure changes still work +def train( + num_epochs, + hidden_channels, + num_gnn_layers, + batch_size, + eval_batch_size, + lr, + checkpointing=False, + tiny_llama=False, + model=None, + dataset=None, + model_save_name=None, +): + def adjust_learning_rate(param_group, LR, epoch): + # Decay the learning rate with half-cycle cosine after warmup + min_lr = 5e-6 + warmup_epochs = 1 + if epoch < warmup_epochs: + lr = LR + else: + lr = min_lr + (LR - min_lr) * 0.5 * ( + 1.0 + math.cos(math.pi * (epoch - warmup_epochs) / + (num_epochs - warmup_epochs))) + param_group['lr'] = lr + return lr + + start_time = time.time() + seed_everything(42) + if dataset is None: + dataset = WebQSPDataset() + gc.collect() + elif not isinstance(dataset, Dataset) and callable(dataset): + dataset = dataset() + gc.collect() + idx_split = dataset.split_idxs + + # Step 1: Build Node Classification Dataset + train_dataset = [dataset[i] for i in idx_split['train']] + val_dataset = [dataset[i] for i in idx_split['val']] + test_dataset = [dataset[i] for i in idx_split['test']] + + train_loader = DataLoader(train_dataset, batch_size=batch_size, + drop_last=True, pin_memory=True, shuffle=True) + val_loader = DataLoader(val_dataset, batch_size=eval_batch_size, + drop_last=False, pin_memory=True, shuffle=False) + test_loader = DataLoader(test_dataset, batch_size=eval_batch_size, + drop_last=False, pin_memory=True, shuffle=False) + + if model is None: + gc.collect() + gnn = GAT( + in_channels=1024, + hidden_channels=hidden_channels, + out_channels=1024, + num_layers=num_gnn_layers, + heads=4, + ) + if tiny_llama: + llm = LLM( + model_name='TinyLlama/TinyLlama-1.1B-Chat-v0.1', + num_params=1, + ) + model = GRetriever(llm=llm, gnn=gnn, mlp_out_channels=2048) + else: + llm = LLM(model_name='meta-llama/Llama-2-7b-chat-hf', num_params=7) + model = GRetriever(llm=llm, gnn=gnn) + + if model_save_name is None: + model_save_name = 'gnn_llm' if num_gnn_layers is not None else 'llm' + + model_save_name = 'gnn_llm' if num_gnn_layers != 0 else 'llm' + if model_save_name == 'llm': + model = llm + + params = [p for _, p in model.named_parameters() if p.requires_grad] + optimizer = torch.optim.AdamW([ + { + 'params': params, + 'lr': lr, + 'weight_decay': 0.05 + }, + ], betas=(0.9, 0.95)) + grad_steps = 2 + + best_epoch = 0 + best_val_loss = float('inf') + for epoch in range(num_epochs): + model.train() + epoch_loss = 0 + if epoch == 0: + print(f"Total Preparation Time: {time.time() - start_time:2f}s") + start_time = time.time() + print("Training beginning...") + epoch_str = f'Epoch: {epoch + 1}|{num_epochs}' + loader = tqdm(train_loader, desc=epoch_str) + for step, batch in enumerate(loader): + optimizer.zero_grad() + loss 
= get_loss(model, batch, model_save_name) + loss.backward() + + clip_grad_norm_(optimizer.param_groups[0]['params'], 0.1) + + if (step + 1) % grad_steps == 0: + adjust_learning_rate(optimizer.param_groups[0], lr, + step / len(train_loader) + epoch) + + optimizer.step() + epoch_loss = epoch_loss + float(loss) + + if (step + 1) % grad_steps == 0: + lr = optimizer.param_groups[0]['lr'] + train_loss = epoch_loss / len(train_loader) + print(epoch_str + f', Train Loss: {train_loss:4f}') + + val_loss = 0 + eval_output = [] + model.eval() + with torch.no_grad(): + for step, batch in enumerate(val_loader): + loss = get_loss(model, batch, model_save_name) + val_loss += loss.item() + val_loss = val_loss / len(val_loader) + print(epoch_str + f", Val Loss: {val_loss:4f}") + if checkpointing and val_loss < best_val_loss: + print("Checkpointing best model...") + best_val_loss = val_loss + best_epoch = epoch + save_params_dict(model, f'{model_save_name}_best_val_loss_ckpt.pt') + torch.cuda.empty_cache() + torch.cuda.reset_max_memory_allocated() + + if checkpointing and best_epoch != num_epochs - 1: + print("Loading best checkpoint...") + model = load_params_dict( + model, + f'{model_save_name}_best_val_loss_ckpt.pt', + ) + + model.eval() + eval_output = [] + print("Final evaluation...") + progress_bar_test = tqdm(range(len(test_loader))) + for step, batch in enumerate(test_loader): + with torch.no_grad(): + pred = inference_step(model, batch, model_save_name) + eval_data = { + 'pred': pred, + 'question': batch.question, + 'desc': batch.desc, + 'label': batch.label + } + eval_output.append(eval_data) + progress_bar_test.update(1) + + # Step 6 Post-processing & compute metrics + compute_metrics(eval_output) + print(f"Total Training Time: {time.time() - start_time:2f}s") + save_params_dict(model, f'{model_save_name}.pt') + torch.save(eval_output, f'{model_save_name}_eval_outs.pt') + print("Done!") + return prep_time, dataset, eval_output # noqa: F821 + + +def _eval_hallucinations_on_loader(outs, loader, eval_batch_size): + model_save_list = [] + model_preds = [] + for out in outs: + model_preds += out['pred'] + for i, batch in enumerate(loader): + correct_answer = batch.label + + model_pred = model_preds[i * eval_batch_size:(i + 1) * eval_batch_size] + model_hallucinates = detect_hallucinate(model_pred, correct_answer) + model_save_list += [tup for tup in zip(model_pred, model_hallucinates)] + return model_save_list + + +def benchmark_models(models: List[Type[nn.Module]], model_names: List[str], + model_kwargs: List[Dict[str, Any]], dataset: Dataset, + lr: float, epochs: int, batch_size: int, + eval_batch_size: int, loss_fn: Callable, + inference_fn: Callable, skip_LLMs: bool = True, + tiny_llama: bool = False, checkpointing: bool = True, + force: bool = False, root_dir='.'): + """Utility function for creating a model benchmark for GRetriever that + grid searches over hyperparameters. Produces a DataFrame containing + metrics for each model. + + Args: + models (List[Type[nn.Module]]): Models to be benchmarked. + model_names (List[str]): Name of save files for model checkpoints + model_kwargs (List[Dict[str, Any]]): Parameters to use for each + particular model. + dataset (Dataset): Input dataset to train on. + lr (float): Learning rate + epochs (int): Number of epochs + batch_size (int): Batch size for training + eval_batch_size (int): Batch size for eval. Also determines + hallucination detection concurrancy. 
+ loss_fn (Callable): Loss function + inference_fn (Callable): Inference function + skip_LLMs (bool, optional): Whether to skip LLM-only runs. + Defaults to True. + tiny_llama (bool, optional): Whether to use tiny llama as LLM. + Defaults to False. + checkpointing (bool, optional): Whether to checkpoint models. + Defaults to True. + force (bool, optional): Whether to rerun already existing results. + Defaults to False. + root_dir (str, optional): Dir to save results and checkpoints in. + Defaults to '.'. + """ + model_log: Dict[str, Dict[str, Any]] = dict() + idx_split = dataset.split_idxs + test_dataset = [dataset[i] for i in idx_split['test']] + loader = DataLoader(test_dataset, batch_size=eval_batch_size, + drop_last=False, pin_memory=True, shuffle=False) + + if not skip_LLMs: + if tiny_llama: + pure_llm = LLM( + model_name="TinyLlama/TinyLlama-1.1B-Chat-v0.1", + num_params=1, + ) + else: + pure_llm = LLM(model_name="meta-llama/Llama-2-7b-chat-hf", + num_params=7) + + if force or not path.exists(root_dir + "/pure_llm_model_log.pt"): + model_log["pure_llm"] = dict() + + pure_preds = [] + for batch in tqdm(loader): + pure_llm_preds = pure_llm.inference(batch.question, batch.desc, + max_tokens=256) + pure_preds += pure_llm_preds + pure_preds = [{"pred": pred} for pred in pure_preds] + + model_log["pure_llm"]["preds"] = pure_preds + model_log["pure_llm"]["hallucinates_list"] = \ + _eval_hallucinations_on_loader(pure_preds, loader, + eval_batch_size) + model_log["pure_llm"]["n_params"] = compute_n_parameters(pure_llm) + torch.save(model_log["pure_llm"], + root_dir + "/pure_llm_model_log.pt") + else: + model_log["pure_llm"] = \ + torch.load(root_dir+"/pure_llm_model_log.pt") + + # LORA + if force or not path.exists(root_dir + "/tuned_llm_model_log.pt"): + model_log["tuned_llm"] = dict() + since = time.time() + gc.collect() + prep_time, _, lora_eval_outs = train(since, epochs, None, None, + batch_size, eval_batch_size, + lr, loss_fn, inference_fn, + model=pure_llm, + dataset=dataset) + torch.cuda.empty_cache() + torch.cuda.reset_max_memory_allocated() + gc.collect() + e2e_time = round(time.time() - since, 2) + model_log["tuned_llm"]["prep_time"] = prep_time + model_log["tuned_llm"]["e2e_time"] = e2e_time + model_log["tuned_llm"]["eval_output"] = lora_eval_outs + print("E2E time (e2e_time) =", e2e_time, "seconds") + print("E2E tme minus Prep Time =", e2e_time - prep_time, "seconds") + + model_log["tuned_llm"]["hallucinates_list"] = \ + _eval_hallucinations_on_loader(lora_eval_outs, loader, + eval_batch_size) + model_log["tuned_llm"]["n_params"] = compute_n_parameters(pure_llm) + torch.save(model_log["tuned_llm"], + root_dir + "/tuned_llm_model_log.pt") + else: + model_log["tuned_llm"] = \ + torch.load(root_dir+"/tuned_llm_model_log.pt") + + del pure_llm + gc.collect() + + # All other models + for name, Model, kwargs in zip(model_names, models, model_kwargs): + model_log[name] = dict() + train_model = True + if path.exists(root_dir + f"/{name}.pt") and not force: + print(f"Model {name} appears to already exist.") + print("Would you like to retrain?") + train_model = str(input("(y/n):")).lower() == "y" + + if train_model: + since = time.time() + gc.collect() + model = Model(**kwargs) + prep_time, _, model_eval_outs = train( + since=since, num_epochs=epochs, hidden_channels=None, + num_gnn_layers=None, batch_size=batch_size, + eval_batch_size=eval_batch_size, lr=lr, loss_fn=loss_fn, + inference_fn=inference_fn, checkpointing=checkpointing, + tiny_llama=tiny_llama, dataset=dataset, + 
model_save_name=root_dir + '/' + name, model=model) + torch.cuda.empty_cache() + torch.cuda.reset_max_memory_allocated() + gc.collect() + e2e_time = round(time.time() - since, 2) + model_log[name]["prep_time"] = prep_time + model_log[name]["e2e_time"] = e2e_time + model_log[name]["eval_output"] = model_eval_outs + print("E2E time (e2e_time) =", e2e_time, "seconds") + print("E2E tme minus Prep Time =", e2e_time - prep_time, "seconds") + model_log[name]["n_params"] = compute_n_parameters(model) + del model + gc.collect() + else: + model_eval_outs = torch.load(root_dir + f"/{name}_eval_outs.pt") + + # Calculate Hallucinations + skip_hallucination_detection = False + + if path.exists(root_dir + f"/{name}_model_log.pt") and not force: + print(f"Saved outputs for {name} have been found.") + print("Would you like to redo?") + user_input = str(input("(y/n):")).lower() + skip_hallucination_detection = user_input != "y" + + if not skip_hallucination_detection: + model_save_list = _eval_hallucinations_on_loader( + model_eval_outs, loader, eval_batch_size) + + model_log[name]["hallucinates_list"] = model_save_list + torch.save(model_log[name], root_dir + f"/{name}_model_log.pt") + else: + model_log[name]["hallucinates_list"] = \ + torch.load( + root_dir+f"/{name}_model_log.pt" + )["hallucinates_list"] + + hal_dict = { + k: [tup[1] for tup in v["hallucinates_list"]] + for (k, v) in model_log.items() + } + hallucinates_df = pd.DataFrame(hal_dict).astype(str) + hallucinates_df = hallucinates_df.apply(pd.Series.value_counts).transpose() + hallucinates_df['e2e_time'] = pd.Series( + {k: v.get('e2e_time') + for (k, v) in model_log.items()}) + hallucinates_df['n_params'] = pd.Series( + {k: v.get('n_params') + for (k, v) in model_log.items()}) + print(hallucinates_df) + hallucinates_df.to_csv(root_dir + "/hallucinates_df.csv", index=False) + + +def minimal_demo(gnn_llm_eval_outs, dataset, lr, epochs, batch_size, + eval_batch_size, loss_fn, inference_fn, + skip_pretrained_LLM=False, tiny_llama=False): + if not skip_pretrained_LLM: + print("First comparing against a pretrained LLM...") + # Step 1: Define a single batch size test loader + idx_split = dataset.split_idxs + test_dataset = [dataset[i] for i in idx_split['test']] + # batch size 1 loader for simplicity + loader = DataLoader(test_dataset, batch_size=eval_batch_size, + drop_last=False, pin_memory=True, shuffle=False) + if tiny_llama: + pure_llm = LLM( + model_name="TinyLlama/TinyLlama-1.1B-Chat-v0.1", + num_params=1, + ) + else: + pure_llm = LLM(model_name="meta-llama/Llama-2-7b-chat-hf", + num_params=7) + if path.exists("demo_save_dict.pt"): + print("Saved outputs for the first step of the demo found.") + print("Would you like to redo?") + user_input = str(input("(y/n):")).lower() + skip_step_one = user_input == "n" + else: + skip_step_one = False + + if not skip_step_one: + gnn_llm_hallucin_sum = 0 + pure_llm_hallucin_sum = 0 + gnn_save_list = [] + untuned_llm_save_list = [] + gnn_llm_preds = [] + for out in gnn_llm_eval_outs: + gnn_llm_preds += out['pred'] + if skip_pretrained_LLM: + print("Checking GNN+LLM for hallucinations...") + else: + print( + "Checking pretrained LLM vs trained GNN+LLM for hallucinations..." 
# noqa + ) + for i, batch in enumerate(tqdm(loader)): + question = batch.question + correct_answer = batch.label + + gnn_llm_pred = gnn_llm_preds[i * eval_batch_size:(i + 1) * + eval_batch_size] + gnn_llm_hallucinates = detect_hallucinate(gnn_llm_pred, + correct_answer) + gnn_save_list += [ + tup for tup in zip(gnn_llm_pred, gnn_llm_hallucinates) + ] + + if not skip_pretrained_LLM: + # GNN+LLM only using 32 tokens to answer. + # Allow more output tokens for untrained LLM + pure_llm_pred = pure_llm.inference(batch.question, batch.desc, + max_tokens=256) + pure_llm_hallucinates = detect_hallucinate( + pure_llm_pred, correct_answer) + else: + pure_llm_pred = [''] * len(gnn_llm_hallucinates) + pure_llm_hallucinates = [False] * len(gnn_llm_hallucinates) + untuned_llm_save_list += [ + tup for tup in zip(pure_llm_pred, pure_llm_hallucinates) + ] + + for gnn_llm_hal, pure_llm_hal in zip(gnn_llm_hallucinates, + pure_llm_hallucinates): + if gnn_llm_hal == "skip" or pure_llm_hal == "skip": # noqa + # skipping when hallucination is hard to eval + continue + gnn_llm_hallucin_sum += int(gnn_llm_hal) + pure_llm_hallucin_sum += int(pure_llm_hal) + if not skip_pretrained_LLM: + print("Total Pure LLM Hallucinations:", pure_llm_hallucin_sum) + print("Total GNN+LLM Hallucinations:", gnn_llm_hallucin_sum) + percent = 100.0 * round( + 1 - (gnn_llm_hallucin_sum / pure_llm_hallucin_sum), 2) + print(f"GNN reduces pretrained LLM hallucinations by: ~{percent}%") + print("Note: hallucinations detected by regex hence the ~") + print("Now we see how the LLM compares when finetuned...") + print("Saving outputs of GNN+LLM and pretrained LLM...") + save_dict = { + "gnn_save_list": gnn_save_list, + "untuned_llm_save_list": untuned_llm_save_list, + "gnn_llm_hallucin_sum": gnn_llm_hallucin_sum, + "pure_llm_hallucin_sum": pure_llm_hallucin_sum + } + torch.save(save_dict, "demo_save_dict.pt") + print("Done!") + else: + save_dict = torch.load("demo_save_dict.pt") + gnn_save_list = save_dict["gnn_save_list"] + untuned_llm_save_list = save_dict["untuned_llm_save_list"] + gnn_llm_hallucin_sum = save_dict["gnn_llm_hallucin_sum"] + pure_llm_hallucin_sum = save_dict["pure_llm_hallucin_sum"] + + trained_llm_hallucin_sum = 0 + untuned_llm_hallucin_sum = pure_llm_hallucin_sum + final_prnt_str = "" + if path.exists("llm.pt") and path.exists("llm_eval_outs.pt"): + print("Existing finetuned LLM found.") + print("Would you like to retrain?") + user_input = str(input("(y/n):")).lower() + retrain = user_input == "y" + else: + retrain = True + if retrain: + print("Finetuning LLM...") + since = time.time() + _, _, pure_llm_eval_outputs = train(since, epochs, None, None, + batch_size, eval_batch_size, lr, + loss_fn, inference_fn, + model=pure_llm, dataset=dataset) + e2e_time = round(time.time() - since, 2) + print("E2E time (e2e_time) =", e2e_time, "seconds") + else: + pure_llm_eval_outputs = torch.load("llm_eval_outs.pt") + pure_llm_preds = [] + for out in pure_llm_eval_outputs: + pure_llm_preds += out['pred'] + print("Final comparison between all models...") + for i, batch in enumerate(tqdm(loader)): + question = batch.question + correct_answer = batch.label + gnn_llm_pred, gnn_llm_hallucinates = list( + zip(*gnn_save_list[i * eval_batch_size:(i + 1) * eval_batch_size])) + untuned_llm_pred, untuned_llm_hallucinates = list( + zip(*untuned_llm_save_list[i * eval_batch_size:(i + 1) * + eval_batch_size])) + pure_llm_pred = pure_llm_preds[i * eval_batch_size:(i + 1) * + eval_batch_size] + pure_llm_hallucinates = detect_hallucinate(pure_llm_pred, + 
correct_answer) + for j in range(len(gnn_llm_pred)): + if skip_pretrained_LLM: + # we did not check the untrained LLM, so do not decide to demo + # based on this. + # HACK + untuned_llm_hallucinates = {j: True} + if gnn_llm_hallucinates[j] == "skip" or untuned_llm_hallucinates[ + j] == "skip" or pure_llm_hallucinates[j] == "skip": + continue + trained_llm_hallucin_sum += int(pure_llm_hallucinates[j]) + if untuned_llm_hallucinates[j] and pure_llm_hallucinates[ + j] and not gnn_llm_hallucinates[j]: # noqa + final_prnt_str += "Prompt: '" + question[j] + "'\n" + final_prnt_str += "Label: '" + correct_answer[j] + "'\n" + if not skip_pretrained_LLM: + final_prnt_str += "Untuned LLM Output: '" \ + + untuned_llm_pred[j] + "'\n" # noqa + final_prnt_str += "Tuned LLM Output: '" + pure_llm_pred[ + j] + "'\n" + final_prnt_str += "GNN+LLM Output: '" + gnn_llm_pred[j] + "'\n" + final_prnt_str += "\n" + "#" * 20 + "\n\n" + if not skip_pretrained_LLM: + print("Total untuned LLM Hallucinations:", untuned_llm_hallucin_sum) + print("Total tuned LLM Hallucinations:", trained_llm_hallucin_sum) + print("Total GNN+LLM Hallucinations:", gnn_llm_hallucin_sum) + if not skip_pretrained_LLM: + percent = 100.0 * round( + 1 - (gnn_llm_hallucin_sum / untuned_llm_hallucin_sum), 2) + print(f"GNN reduces untuned LLM hallucinations by: ~{percent}%") + tuned_percent = 100.0 * round( + 1 - (gnn_llm_hallucin_sum / trained_llm_hallucin_sum), 2) + print(f"GNN reduces tuned LLM hallucinations by: ~{tuned_percent}%") + print("Note: hallucinations detected by regex hence the ~") + print("Potential instances where GNN solves the hallucinations of LLM:") + print(final_prnt_str) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--gnn_hidden_channels', type=int, default=1024) + parser.add_argument('--num_gnn_layers', type=int, default=4) + parser.add_argument('--lr', type=float, default=1e-5) + parser.add_argument('--epochs', type=int, default=2) + parser.add_argument('--batch_size', type=int, default=8) + parser.add_argument('--eval_batch_size', type=int, default=16) + parser.add_argument('--checkpointing', action='store_true') + parser.add_argument('--tiny_llama', action='store_true') + parser.add_argument( + "--skip_pretrained_llm_eval", action="store_true", + help="This flag will skip the evaluation of the pretrained LLM.") + args = parser.parse_args() + + start_time = time.time() + train( + args.epochs, + args.gnn_hidden_channels, + args.num_gnn_layers, + args.batch_size, + args.eval_batch_size, + args.lr, + checkpointing=args.checkpointing, + tiny_llama=args.tiny_llama, + ) + print(f"Total Time: {time.time() - start_time:2f}s") diff --git a/examples/llm/g_retriever_utils/rag_backend_utils.py b/examples/llm/g_retriever_utils/rag_backend_utils.py new file mode 100644 index 000000000000..0f1c0e1b87ec --- /dev/null +++ b/examples/llm/g_retriever_utils/rag_backend_utils.py @@ -0,0 +1,224 @@ +from dataclasses import dataclass +from enum import Enum, auto +from typing import ( + Any, + Callable, + Dict, + Iterable, + Optional, + Protocol, + Tuple, + Type, + runtime_checkable, +) + +import torch +from torch import Tensor +from torch.nn import Module + +from torch_geometric.data import ( + FeatureStore, + GraphStore, + LargeGraphIndexer, + TripletLike, +) +from torch_geometric.data.large_graph_indexer import EDGE_RELATION +from torch_geometric.distributed import ( + LocalFeatureStore, + LocalGraphStore, + Partitioner, +) +from torch_geometric.typing import EdgeType, NodeType + +RemoteGraphBackend 
= Tuple[FeatureStore, GraphStore] + +# TODO: Make everything compatible with Hetero graphs aswell + + +# Adapted from LocalGraphStore +@runtime_checkable +class ConvertableGraphStore(Protocol): + @classmethod + def from_data( + cls, + edge_id: Tensor, + edge_index: Tensor, + num_nodes: int, + is_sorted: bool = False, + ) -> GraphStore: + ... + + @classmethod + def from_hetero_data( + cls, + edge_id_dict: Dict[EdgeType, Tensor], + edge_index_dict: Dict[EdgeType, Tensor], + num_nodes_dict: Dict[NodeType, int], + is_sorted: bool = False, + ) -> GraphStore: + ... + + @classmethod + def from_partition(cls, root: str, pid: int) -> GraphStore: + ... + + +# Adapted from LocalFeatureStore +@runtime_checkable +class ConvertableFeatureStore(Protocol): + @classmethod + def from_data( + cls, + node_id: Tensor, + x: Optional[Tensor] = None, + y: Optional[Tensor] = None, + edge_id: Optional[Tensor] = None, + edge_attr: Optional[Tensor] = None, + ) -> FeatureStore: + ... + + @classmethod + def from_hetero_data( + cls, + node_id_dict: Dict[NodeType, Tensor], + x_dict: Optional[Dict[NodeType, Tensor]] = None, + y_dict: Optional[Dict[NodeType, Tensor]] = None, + edge_id_dict: Optional[Dict[EdgeType, Tensor]] = None, + edge_attr_dict: Optional[Dict[EdgeType, Tensor]] = None, + ) -> FeatureStore: + ... + + @classmethod + def from_partition(cls, root: str, pid: int) -> FeatureStore: + ... + + +class RemoteDataType(Enum): + DATA = auto() + PARTITION = auto() + + +@dataclass +class RemoteGraphBackendLoader: + """Utility class to load triplets into a RAG Backend.""" + path: str + datatype: RemoteDataType + graph_store_type: Type[ConvertableGraphStore] + feature_store_type: Type[ConvertableFeatureStore] + + def load(self, pid: Optional[int] = None) -> RemoteGraphBackend: + if self.datatype == RemoteDataType.DATA: + data_obj = torch.load(self.path) + graph_store = self.graph_store_type.from_data( + edge_id=data_obj['edge_id'], edge_index=data_obj.edge_index, + num_nodes=data_obj.num_nodes) + feature_store = self.feature_store_type.from_data( + node_id=data_obj['node_id'], x=data_obj.x, + edge_id=data_obj['edge_id'], edge_attr=data_obj.edge_attr) + elif self.datatype == RemoteDataType.PARTITION: + if pid is None: + assert pid is not None, \ + "Partition ID must be defined for loading from a " \ + + "partitioned store." + graph_store = self.graph_store_type.from_partition(self.path, pid) + feature_store = self.feature_store_type.from_partition( + self.path, pid) + else: + raise NotImplementedError + return (feature_store, graph_store) + + +# TODO: make profilable +def create_remote_backend_from_triplets( + triplets: Iterable[TripletLike], node_embedding_model: Module, + edge_embedding_model: Module | None = None, + graph_db: Type[ConvertableGraphStore] = LocalGraphStore, + feature_db: Type[ConvertableFeatureStore] = LocalFeatureStore, + node_method_to_call: str = "forward", + edge_method_to_call: str | None = None, + pre_transform: Callable[[TripletLike], TripletLike] | None = None, + path: str = '', n_parts: int = 1, + node_method_kwargs: Optional[Dict[str, Any]] = None, + edge_method_kwargs: Optional[Dict[str, Any]] = None +) -> RemoteGraphBackendLoader: + """Utility function that can be used to create a RAG Backend from triplets. + + Args: + triplets (Iterable[TripletLike]): Triplets to load into the RAG + Backend. + node_embedding_model (Module): Model to embed nodes into a feature + space. + edge_embedding_model (Module | None, optional): Model to embed edges + into a feature space. Defaults to the node model. 
+ graph_db (Type[ConvertableGraphStore], optional): GraphStore class to + use. Defaults to LocalGraphStore. + feature_db (Type[ConvertableFeatureStore], optional): FeatureStore + class to use. Defaults to LocalFeatureStore. + node_method_to_call (str, optional): method to call for embeddings on + the node model. Defaults to "forward". + edge_method_to_call (str | None, optional): method to call for + embeddings on the edge model. Defaults to the node method. + pre_transform (Callable[[TripletLike], TripletLike] | None, optional): + optional preprocessing function for triplets. Defaults to None. + path (str, optional): path to save resulting stores. Defaults to ''. + n_parts (int, optional): Number of partitons to store in. + Defaults to 1. + node_method_kwargs (Optional[Dict[str, Any]], optional): args to pass + into node encoding method. Defaults to None. + edge_method_kwargs (Optional[Dict[str, Any]], optional): args to pass + into edge encoding method. Defaults to None. + + Returns: + RemoteGraphBackendLoader: Loader to load RAG backend from disk or + memory. + """ + # Will return attribute errors for missing attributes + if not issubclass(graph_db, ConvertableGraphStore): + getattr(graph_db, "from_data") + getattr(graph_db, "from_hetero_data") + getattr(graph_db, "from_partition") + elif not issubclass(feature_db, ConvertableFeatureStore): + getattr(feature_db, "from_data") + getattr(feature_db, "from_hetero_data") + getattr(feature_db, "from_partition") + + # Resolve callable methods + node_method_kwargs = node_method_kwargs \ + if node_method_kwargs is not None else dict() + + edge_embedding_model = edge_embedding_model \ + if edge_embedding_model is not None else node_embedding_model + edge_method_to_call = edge_method_to_call \ + if edge_method_to_call is not None else node_method_to_call + edge_method_kwargs = edge_method_kwargs \ + if edge_method_kwargs is not None else node_method_kwargs + + # These will return AttributeErrors if they don't exist + node_model = getattr(node_embedding_model, node_method_to_call) + edge_model = getattr(edge_embedding_model, edge_method_to_call) + + indexer = LargeGraphIndexer.from_triplets(triplets, + pre_transform=pre_transform) + + node_feats = node_model(indexer.get_node_features(), **node_method_kwargs) + indexer.add_node_feature('x', node_feats) + + edge_feats = edge_model( + indexer.get_unique_edge_features(feature_name=EDGE_RELATION), + **edge_method_kwargs) + indexer.add_edge_feature(new_feature_name="edge_attr", + new_feature_vals=edge_feats, + map_from_feature=EDGE_RELATION) + + data = indexer.to_data(node_feature_name='x', + edge_feature_name='edge_attr') + + if n_parts == 1: + torch.save(data, path) + return RemoteGraphBackendLoader(path, RemoteDataType.DATA, graph_db, + feature_db) + else: + partitioner = Partitioner(data=data, num_parts=n_parts, root=path) + partitioner.generate_partition() + return RemoteGraphBackendLoader(path, RemoteDataType.PARTITION, + graph_db, feature_db) diff --git a/examples/llm/g_retriever_utils/rag_feature_store.py b/examples/llm/g_retriever_utils/rag_feature_store.py new file mode 100644 index 000000000000..e01e9e59bb88 --- /dev/null +++ b/examples/llm/g_retriever_utils/rag_feature_store.py @@ -0,0 +1,189 @@ +import gc +from collections.abc import Iterable, Iterator +from typing import Any, Dict, Optional, Type, Union + +import torch +from torch import Tensor +from torch.nn import Module +from torchmetrics.functional import pairwise_cosine_similarity + +from torch_geometric.data import Data, HeteroData 
+from torch_geometric.distributed import LocalFeatureStore +from torch_geometric.nn.nlp import SentenceTransformer +from torch_geometric.nn.pool import ApproxMIPSKNNIndex +from torch_geometric.sampler import HeteroSamplerOutput, SamplerOutput +from torch_geometric.typing import InputEdges, InputNodes + + +# NOTE: Only compatible with Homogeneous graphs for now +class KNNRAGFeatureStore(LocalFeatureStore): + def __init__(self, enc_model: Type[Module], + model_kwargs: Optional[Dict[str, + Any]] = None, *args, **kwargs): + self.device = torch.device( + "cuda" if torch.cuda.is_available() else "cpu") + self.enc_model = enc_model(*args, **kwargs).to(self.device) + self.enc_model.eval() + self.model_kwargs = \ + model_kwargs if model_kwargs is not None else dict() + super().__init__() + + @property + def x(self) -> Tensor: + return self.get_tensor(group_name=None, attr_name='x') + + @property + def edge_attr(self) -> Tensor: + return self.get_tensor(group_name=(None, None), attr_name='edge_attr') + + def retrieve_seed_nodes(self, query: Any, k_nodes: int = 5) -> InputNodes: + result = next(self._retrieve_seed_nodes_batch([query], k_nodes)) + gc.collect() + torch.cuda.empty_cache() + return result + + def _retrieve_seed_nodes_batch(self, query: Iterable[Any], + k_nodes: int) -> Iterator[InputNodes]: + if isinstance(self.meta, dict) and self.meta.get("is_hetero", False): + raise NotImplementedError + + query_enc = self.enc_model.encode(query, + **self.model_kwargs).to(self.device) + prizes = pairwise_cosine_similarity(query_enc, self.x.to(self.device)) + topk = min(k_nodes, len(self.x)) + for q in prizes: + _, indices = torch.topk(q, topk, largest=True) + yield indices + + def retrieve_seed_edges(self, query: Any, k_edges: int = 3) -> InputEdges: + result = next(self._retrieve_seed_edges_batch([query], k_edges)) + gc.collect() + torch.cuda.empty_cache() + return result + + def _retrieve_seed_edges_batch(self, query: Iterable[Any], + k_edges: int) -> Iterator[InputEdges]: + if isinstance(self.meta, dict) and self.meta.get("is_hetero", False): + raise NotImplementedError + + query_enc = self.enc_model.encode(query, + **self.model_kwargs).to(self.device) + + prizes = pairwise_cosine_similarity(query_enc, + self.edge_attr.to(self.device)) + topk = min(k_edges, len(self.edge_attr)) + for q in prizes: + _, indices = torch.topk(q, topk, largest=True) + yield indices + + def load_subgraph( + self, sample: Union[SamplerOutput, HeteroSamplerOutput] + ) -> Union[Data, HeteroData]: + + if isinstance(sample, HeteroSamplerOutput): + raise NotImplementedError + + # NOTE: torch_geometric.loader.utils.filter_custom_store can be used + # here if it supported edge features + node_id = sample.node + edge_id = sample.edge + edge_index = torch.stack((sample.row, sample.col), dim=0) + x = self.x[node_id] + edge_attr = self.edge_attr[edge_id] + + return Data(x=x, edge_index=edge_index, edge_attr=edge_attr, + node_idx=node_id, edge_idx=edge_id) + + +# TODO: Refactor because composition >> inheritance + + +def _add_features_to_knn_index(knn_index: ApproxMIPSKNNIndex, emb: Tensor, + device: torch.device, batch_size: int = 2**20): + """Add new features to the existing KNN index in batches. + + Args: + knn_index (ApproxMIPSKNNIndex): Index to add features to. + emb (Tensor): Embeddings to add. + device (torch.device): Device to store in + batch_size (int, optional): Batch size to iterate by. + Defaults to 2**20, which equates to 4GB if working with + 1024 dim floats. 
+ """ + for i in range(0, emb.size(0), batch_size): + if emb.size(0) - i >= batch_size: + emb_batch = emb[i:i + batch_size].to(device) + else: + emb_batch = emb[i:].to(device) + knn_index.add(emb_batch) + + +class ApproxKNNRAGFeatureStore(KNNRAGFeatureStore): + def __init__(self, enc_model: Type[Module], + model_kwargs: Optional[Dict[str, + Any]] = None, *args, **kwargs): + # TODO: Add kwargs for approx KNN to parameters here. + super().__init__(enc_model, model_kwargs, *args, **kwargs) + self.node_knn_index = None + self.edge_knn_index = None + + def _retrieve_seed_nodes_batch(self, query: Iterable[Any], + k_nodes: int) -> Iterator[InputNodes]: + if isinstance(self.meta, dict) and self.meta.get("is_hetero", False): + raise NotImplementedError + + enc_model = self.enc_model.to(self.device) + query_enc = enc_model.encode(query, + **self.model_kwargs).to(self.device) + del enc_model + gc.collect() + torch.cuda.empty_cache() + + if self.node_knn_index is None: + self.node_knn_index = ApproxMIPSKNNIndex(num_cells=100, + num_cells_to_visit=100, + bits_per_vector=4) + # Need to add in batches to avoid OOM + _add_features_to_knn_index(self.node_knn_index, self.x, + self.device) + + output = self.node_knn_index.search(query_enc, k=k_nodes) + yield from output.index + + def _retrieve_seed_edges_batch(self, query: Iterable[Any], + k_edges: int) -> Iterator[InputEdges]: + if isinstance(self.meta, dict) and self.meta.get("is_hetero", False): + raise NotImplementedError + + enc_model = self.enc_model.to(self.device) + query_enc = enc_model.encode(query, + **self.model_kwargs).to(self.device) + del enc_model + gc.collect() + torch.cuda.empty_cache() + + if self.edge_knn_index is None: + self.edge_knn_index = ApproxMIPSKNNIndex(num_cells=100, + num_cells_to_visit=100, + bits_per_vector=4) + # Need to add in batches to avoid OOM + _add_features_to_knn_index(self.edge_knn_index, self.edge_attr, + self.device) + + output = self.edge_knn_index.search(query_enc, k=k_edges) + yield from output.index + + +# TODO: These two classes should be refactored +class SentenceTransformerFeatureStore(KNNRAGFeatureStore): + def __init__(self, *args, **kwargs): + kwargs['model_name'] = kwargs.get( + 'model_name', 'sentence-transformers/all-roberta-large-v1') + super().__init__(SentenceTransformer, *args, **kwargs) + + +class SentenceTransformerApproxFeatureStore(ApproxKNNRAGFeatureStore): + def __init__(self, *args, **kwargs): + kwargs['model_name'] = kwargs.get( + 'model_name', 'sentence-transformers/all-roberta-large-v1') + super().__init__(SentenceTransformer, *args, **kwargs) diff --git a/examples/llm/g_retriever_utils/rag_generate.py b/examples/llm/g_retriever_utils/rag_generate.py new file mode 100644 index 000000000000..896fbd7598b1 --- /dev/null +++ b/examples/llm/g_retriever_utils/rag_generate.py @@ -0,0 +1,139 @@ +# %% +import argparse +from itertools import chain +from typing import Tuple + +import pandas as pd +import torch +import tqdm +from rag_backend_utils import create_remote_backend_from_triplets +from rag_feature_store import SentenceTransformerFeatureStore +from rag_graph_store import NeighborSamplingRAGGraphStore + +from torch_geometric.data import Data +from torch_geometric.datasets import WebQSPDataset +from torch_geometric.datasets.web_qsp_dataset import ( + preprocess_triplet, + retrieval_via_pcst, +) +from torch_geometric.loader import RAGQueryLoader +from torch_geometric.nn.nlp import SentenceTransformer + +# %% +parser = argparse.ArgumentParser( + description="""Generate new WebQSP subgraphs\n""" 
+ + """NOTE: Evaluating with smaller samples may result in""" + + """ poorer performance for the trained models compared""" + + """ to untrained models.""") +# TODO: Add more arguments for configuring rag params +parser.add_argument("--use_pcst", action="store_true") +parser.add_argument("--num_samples", type=int, default=4700) +parser.add_argument("--out_file", default="subg_results.pt") +args = parser.parse_args() + +# %% +ds = WebQSPDataset("dataset", limit=args.num_samples, verbose=True, + force_reload=True) + +# %% +triplets = chain.from_iterable(d['graph'] for d in ds.raw_dataset) + +# %% +questions = ds.raw_dataset['question'] + +# %% +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +model = SentenceTransformer( + model_name='sentence-transformers/all-roberta-large-v1').to(device) + +# %% +fs, gs = create_remote_backend_from_triplets( + triplets=triplets, node_embedding_model=model, + node_method_to_call="encode", path="backend", + pre_transform=preprocess_triplet, node_method_kwargs={ + "batch_size": 256 + }, graph_db=NeighborSamplingRAGGraphStore, + feature_db=SentenceTransformerFeatureStore).load() + +# %% + + +def apply_retrieval_via_pcst(graph: Data, query: str, topk: int = 3, + topk_e: int = 3, + cost_e: float = 0.5) -> Tuple[Data, str]: + q_emb = model.encode(query) + textual_nodes = ds.textual_nodes.iloc[graph["node_idx"]].reset_index() + textual_edges = ds.textual_edges.iloc[graph["edge_idx"]].reset_index() + out_graph, desc = retrieval_via_pcst(graph, q_emb, textual_nodes, + textual_edges, topk, topk_e, cost_e) + out_graph["desc"] = desc + return out_graph + + +def apply_retrieval_with_text(graph: Data, query: str) -> Tuple[Data, str]: + textual_nodes = ds.textual_nodes.iloc[graph["node_idx"]].reset_index() + textual_edges = ds.textual_edges.iloc[graph["edge_idx"]].reset_index() + desc = ( + textual_nodes.to_csv(index=False) + "\n" + + textual_edges.to_csv(index=False, columns=["src", "edge_attr", "dst"])) + graph["desc"] = desc + return graph + + +transform = apply_retrieval_via_pcst \ + if args.use_pcst else apply_retrieval_with_text + +query_loader = RAGQueryLoader(data=(fs, gs), seed_nodes_kwargs={"k_nodes": 5}, + seed_edges_kwargs={"k_edges": 5}, + sampler_kwargs={"num_neighbors": [50] * 2}, + local_filter=transform) + + +# %% +# Accuracy Metrics to be added to Profiler +def _eidx_helper(subg: Data, ground_truth: Data): + subg_eidx, gt_eidx = subg.edge_idx, ground_truth.edge_idx + if isinstance(subg_eidx, torch.Tensor): + subg_eidx = subg_eidx.tolist() + if isinstance(gt_eidx, torch.Tensor): + gt_eidx = gt_eidx.tolist() + subg_e = set(subg_eidx) + gt_e = set(gt_eidx) + return subg_e, gt_e + + +def check_retrieval_accuracy(subg: Data, ground_truth: Data, num_edges: int): + subg_e, gt_e = _eidx_helper(subg, ground_truth) + total_e = set(range(num_edges)) + tp = len(subg_e & gt_e) + tn = len(total_e - (subg_e | gt_e)) + return (tp + tn) / num_edges + + +def check_retrieval_precision(subg: Data, ground_truth: Data): + subg_e, gt_e = _eidx_helper(subg, ground_truth) + return len(subg_e & gt_e) / len(subg_e) + + +def check_retrieval_recall(subg: Data, ground_truth: Data): + subg_e, gt_e = _eidx_helper(subg, ground_truth) + return len(subg_e & gt_e) / len(gt_e) + + +# %% +retrieval_stats = {"precision": [], "recall": [], "accuracy": []} +subgs = [] +node_len = [] +edge_len = [] +for subg in tqdm.tqdm(query_loader.query(q) for q in questions): + subgs.append(subg) + node_len.append(subg['x'].shape[0]) + edge_len.append(subg['edge_attr'].shape[0]) + +for 
i, subg in enumerate(subgs): + subg['question'] = questions[i] + subg['label'] = ds[i]['label'] + +pd.DataFrame.from_dict(retrieval_stats).to_csv( + args.out_file.split('.')[0] + '_metadata.csv') +torch.save(subgs, args.out_file) diff --git a/examples/llm/g_retriever_utils/rag_graph_store.py b/examples/llm/g_retriever_utils/rag_graph_store.py new file mode 100644 index 000000000000..48473f287233 --- /dev/null +++ b/examples/llm/g_retriever_utils/rag_graph_store.py @@ -0,0 +1,107 @@ +from typing import Optional, Union + +import torch +from torch import Tensor + +from torch_geometric.data import FeatureStore +from torch_geometric.distributed import LocalGraphStore +from torch_geometric.sampler import ( + HeteroSamplerOutput, + NeighborSampler, + NodeSamplerInput, + SamplerOutput, +) +from torch_geometric.sampler.neighbor_sampler import NumNeighborsType +from torch_geometric.typing import EdgeTensorType, InputEdges, InputNodes + + +class NeighborSamplingRAGGraphStore(LocalGraphStore): + def __init__(self, feature_store: Optional[FeatureStore] = None, + num_neighbors: NumNeighborsType = [1], **kwargs): + self.feature_store = feature_store + self._num_neighbors = num_neighbors + self.sample_kwargs = kwargs + self._sampler_is_initialized = False + super().__init__() + + def _init_sampler(self): + if self.feature_store is None: + raise AttributeError("Feature store not registered yet.") + self.sampler = NeighborSampler(data=(self.feature_store, self), + num_neighbors=self._num_neighbors, + **self.sample_kwargs) + self._sampler_is_initialized = True + + def register_feature_store(self, feature_store: FeatureStore): + self.feature_store = feature_store + self._sampler_is_initialized = False + + def put_edge_id(self, edge_id: Tensor, *args, **kwargs) -> bool: + ret = super().put_edge_id(edge_id.contiguous(), *args, **kwargs) + self._sampler_is_initialized = False + return ret + + @property + def edge_index(self): + return self.get_edge_index(*self.edge_idx_args, **self.edge_idx_kwargs) + + def put_edge_index(self, edge_index: EdgeTensorType, *args, + **kwargs) -> bool: + ret = super().put_edge_index(edge_index, *args, **kwargs) + # HACK + self.edge_idx_args = args + self.edge_idx_kwargs = kwargs + self._sampler_is_initialized = False + return ret + + @property + def num_neighbors(self): + return self._num_neighbors + + @num_neighbors.setter + def num_neighbors(self, num_neighbors: NumNeighborsType): + self._num_neighbors = num_neighbors + if hasattr(self, 'sampler'): + self.sampler.num_neighbors = num_neighbors + + def sample_subgraph( + self, seed_nodes: InputNodes, seed_edges: InputEdges, + num_neighbors: Optional[NumNeighborsType] = None + ) -> Union[SamplerOutput, HeteroSamplerOutput]: + """Sample the graph starting from the given nodes and edges using the + in-built NeighborSampler. + + Args: + seed_nodes (InputNodes): Seed nodes to start sampling from. + seed_edges (InputEdges): Seed edges to start sampling from. + num_neighbors (Optional[NumNeighborsType], optional): Parameters + to determine how many hops and number of neighbors per hop. + Defaults to None. + + Returns: + Union[SamplerOutput, HeteroSamplerOutput]: NeighborSamplerOutput + for the input. 
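+
+        Example (an illustrative sketch only; it assumes both stores are
+        already populated and registered with each other):
+
+        .. code-block:: python
+
+            import torch
+
+            # Hypothetical seed ids for demonstration:
+            seed_nodes = torch.tensor([0, 1, 2])
+            seed_edges = torch.tensor([0, 5])
+            out = graph_store.sample_subgraph(seed_nodes, seed_edges,
+                                              num_neighbors=[10, 10])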
+        """
+        if not self._sampler_is_initialized:
+            self._init_sampler()
+        if num_neighbors is not None:
+            self.num_neighbors = num_neighbors
+
+        # FIXME: Right now, only input nodes/edges as tensors are supported
+        if not isinstance(seed_nodes, Tensor):
+            raise NotImplementedError
+        if not isinstance(seed_edges, Tensor):
+            raise NotImplementedError
+        device = seed_nodes.device
+
+        # TODO: Call sample_from_edges for seed_edges
+        # Turning them into nodes for now.
+        seed_edges = self.edge_index.to(device).T[seed_edges.to(
+            device)].reshape(-1)
+        seed_nodes = torch.cat((seed_nodes, seed_edges), dim=0)
+
+        seed_nodes = seed_nodes.unique().contiguous()
+        node_sample_input = NodeSamplerInput(input_id=None, node=seed_nodes)
+        out = self.sampler.sample_from_nodes(node_sample_input)
+
+        return out
diff --git a/examples/llm/multihop_rag/README.md b/examples/llm/multihop_rag/README.md
new file mode 100644
index 000000000000..ff43b16a2c05
--- /dev/null
+++ b/examples/llm/multihop_rag/README.md
@@ -0,0 +1,9 @@
+# Examples for LLM and GNN co-training
+
+| Example                                                   | Description                                                                                                                                |
+| --------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
+| [`multihop_download.sh`](./multihop_download.sh)           | Downloads all the components of the multihop dataset.                                                                                        |
+| [`multihop_preprocess.py`](./multihop_preprocess.py)       | Preprocesses the dataset to pair questions/answers with components in the knowledge graph. Contains documentation to describe the process.  |
+| [`rag_generate_multihop.py`](./rag_generate_multihop.py)   | Utilizes the sample remote backend in [`g_retriever_utils`](../g_retriever_utils/) to generate subgraphs for the multihop dataset.          |
+
+NOTE: Performance of GRetriever on this dataset has not been evaluated.
diff --git a/examples/llm/multihop_rag/multihop_download.sh b/examples/llm/multihop_rag/multihop_download.sh
new file mode 100644
index 000000000000..3c1970d39440
--- /dev/null
+++ b/examples/llm/multihop_rag/multihop_download.sh
@@ -0,0 +1,12 @@
+#!/bin/sh
+
+# Wikidata5m
+
+wget -O "wikidata5m_alias.tar.gz" "https://www.dropbox.com/s/lnbhc8yuhit4wm5/wikidata5m_alias.tar.gz"
+tar -xvf "wikidata5m_alias.tar.gz"
+wget -O "wikidata5m_all_triplet.txt.gz" "https://www.dropbox.com/s/563omb11cxaqr83/wikidata5m_all_triplet.txt.gz"
+gzip -d "wikidata5m_all_triplet.txt.gz" -f
+
+# 2WikiMultiHopQA
+wget -O "data_ids_april7.zip" "https://www.dropbox.com/s/ms2m13252h6xubs/data_ids_april7.zip"
+unzip -o "data_ids_april7.zip"
diff --git a/examples/llm/multihop_rag/multihop_preprocess.py b/examples/llm/multihop_rag/multihop_preprocess.py
new file mode 100644
index 000000000000..46052bdf1b15
--- /dev/null
+++ b/examples/llm/multihop_rag/multihop_preprocess.py
@@ -0,0 +1,276 @@
+"""Example workflow for downloading and assembling a multihop QA dataset."""
+
+import argparse
+import json
+from subprocess import call
+
+import pandas as pd
+import torch
+import tqdm
+
+from torch_geometric.data import LargeGraphIndexer
+
+# %% [markdown]
+# # Encoding A Large Knowledge Graph Part 2
+
+# %% [markdown]
+# In this notebook, we will continue where we left off by building a new
+# multi-hop QA dataset based on Wikidata.
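+
+# %% [markdown]
+# As a purely illustrative sketch (not produced by any code in this
+# notebook), the records we are ultimately aiming for pair a natural-language
+# question with the plaintext triplets that justify its answer:
+
+# %%
+# Hypothetical example record; the real column names are introduced below.
+example_record = {
+    "question": "Who directed the film that won Best Picture in 1995?",
+    "answer": "Robert Zemeckis",
+    "evidences": [
+        ("Forrest Gump", "award received", "Academy Award for Best Picture"),
+        ("Forrest Gump", "director", "Robert Zemeckis"),
+    ],
+}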
+ +# %% [markdown] +# ## Example 2: Building a new Dataset from Questions and an already-existing +# Knowledge Graph + +# %% [markdown] +# ### Motivation + +# %% [markdown] +# One potential application of knowledge graph structural encodings is +# capturing the relationships between different entities that are multiple +# hops apart. This can be challenging for an LLM to recognize from prepended +# graph information. Here's a motivating example (credit to @Rishi Puri): + +# %% [markdown] +# In this example, the question can only be answered by reasoning about the +# relationships between the entities in the knowledge graph. + +# %% [markdown] +# ### Building a Multi-Hop QA Dataset + +# %% [markdown] +# To start, we need to download the raw data of a knowledge graph. +# In this case, we use WikiData5M +# ([Wang et al] +# (https://paperswithcode.com/paper/kepler-a-unified-model-for-knowledge)). +# Here we download the raw triplets and their entity codes. Information about +# this dataset can be found +# [here](https://deepgraphlearning.github.io/project/wikidata5m). + +# %% [markdown] +# The following download contains the ID to plaintext mapping for all the +# entities and relations in the knowledge graph: + +rv = call("./multihop_download.sh") + +# %% [markdown] +# To start, we are going to preprocess the knowledge graph to substitute each +# of the entity/relation codes with their plaintext aliases. This makes it +# easier to use a pre-trained textual encoding model to create triplet +# embeddings, as such a model likely won't understand how to properly embed +# the entity codes. + +# %% + +# %% +parser = argparse.ArgumentParser(description="Preprocess wikidata5m") +parser.add_argument("--n_triplets", type=int, default=-1) +args = parser.parse_args() + +# %% +# Substitute entity codes with their aliases +# Picking the first alias for each entity (rather arbitrarily) +alias_map = {} +rel_alias_map = {} +for line in open('wikidata5m_entity.txt'): + parts = line.strip().split('\t') + entity_id = parts[0] + aliases = parts[1:] + alias_map[entity_id] = aliases[0] +for line in open('wikidata5m_relation.txt'): + parts = line.strip().split('\t') + relation_id = parts[0] + relation_name = parts[1] + rel_alias_map[relation_id] = relation_name + +# %% +full_graph = [] +missing_total = 0 +total = 0 +limit = None if args.n_triplets == -1 else args.n_triplets +i = 0 + +for line in tqdm.tqdm(open('wikidata5m_all_triplet.txt')): + if limit is not None and i >= limit: + break + src, rel, dst = line.strip().split('\t') + if src not in alias_map: + missing_total += 1 + if dst not in alias_map: + missing_total += 1 + if rel not in rel_alias_map: + missing_total += 1 + total += 3 + full_graph.append([ + alias_map.get(src, src), + rel_alias_map.get(rel, rel), + alias_map.get(dst, dst) + ]) + i += 1 +print(f"Missing aliases: {missing_total}/{total}") + +# %% [markdown] +# Now `full_graph` represents the knowledge graph triplets in +# understandable plaintext. + +# %% [markdown] +# Next, we need a set of multi-hop questions that the Knowledge Graph will +# provide us with context for. We utilize a subset of +# [HotPotQA](https://hotpotqa.github.io/) +# ([Yang et. al.](https://arxiv.org/pdf/1809.09600)) called +# [2WikiMultiHopQA](https://github.com/Alab-NII/2wikimultihop) +# ([Ho et. 
al.](https://aclanthology.org/2020.coling-main.580.pdf)), +# which includes a subgraph of entities that serve as the ground truth +# justification for answering each multi-hop question: + +# %% +with open('train.json') as f: + train_data = json.load(f) +train_df = pd.DataFrame(train_data) +train_df['split_type'] = 'train' + +with open('dev.json') as f: + dev_data = json.load(f) +dev_df = pd.DataFrame(dev_data) +dev_df['split_type'] = 'dev' + +with open('test.json') as f: + test_data = json.load(f) +test_df = pd.DataFrame(test_data) +test_df['split_type'] = 'test' + +df = pd.concat([train_df, dev_df, test_df]) + +# %% [markdown] +# Now we need to extract the subgraphs + +# %% +df['graph_size'] = df['evidences_id'].apply(lambda row: len(row)) + +# %% [markdown] +# (Optional) We take only questions where the evidence graph is greater than +# 0. (Note: this gets rid of the test set): + +# %% +# df = df[df['graph_size'] > 0] + +# %% +refined_df = df[[ + '_id', 'question', 'answer', 'split_type', 'evidences_id', 'type', + 'graph_size' +]] + +# %% [markdown] +# Checkpoint: + +# %% +refined_df.to_csv('wikimultihopqa_refined.csv', index=False) + +# %% [markdown] +# Now we need to check that all the entities mentioned in the question/answer +# set are also present in the Wikidata graph: + +# %% +relation_map = {} +with open('wikidata5m_relation.txt') as f: + for line in tqdm.tqdm(f): + parts = line.strip().split('\t') + for i in range(1, len(parts)): + if parts[i] not in relation_map: + relation_map[parts[i]] = [] + relation_map[parts[i]].append(parts[0]) + +# %% +entity_set = set() +with open('wikidata5m_entity.txt') as f: + for line in tqdm.tqdm(f): + entity_set.add(line.strip().split('\t')[0]) + +# %% +missing_entities = set() +missing_entity_idx = set() +for i, row in enumerate(refined_df.itertuples()): + for trip in row.evidences_id: + entities = trip[0], trip[2] + for entity in entities: + if entity not in entity_set: + # print( + # f'The following entity was not found in the KG: {entity}' + # ) + missing_entities.add(entity) + missing_entity_idx.add(i) + +# %% [markdown] +# Right now, we drop the missing entity entries. Additional preprocessing can +# be done here to resolve the entity/relation collisions, but that is out of +# the scope for this notebook. + +# %% +# missing relations are ok, but missing entities cannot be mapped to +# plaintext, so they should be dropped. +refined_df.reset_index(inplace=True, drop=True) + +# %% +cleaned_df = refined_df.drop(missing_entity_idx) + +# %% [markdown] +# Now we save the resulting graph and questions/answers dataset: + +# %% +cleaned_df.to_csv('wikimultihopqa_cleaned.csv', index=False) + +# %% + +# %% +torch.save(full_graph, 'wikimultihopqa_full_graph.pt') + +# %% [markdown] +# ### Question: How do we extract a contextual subgraph for a given query? + +# %% [markdown] +# The chosen retrieval algorithm is a critical component in the pipeline for +# affecting RAG performance. In the next section (1), we will demonstrate a +# naive method of retrieval for a large knowledge graph, and how to apply it +# to this dataset along with WebQSP. + +# %% [markdown] +# ### Preparing a Textualized Graph for LLM + +# %% [markdown] +# For now however, we need to prepare the graph data to be used as a plaintext +# prefix to the LLM. In order to do this, we want to prompt the LLM to use the +# unique nodes, and unique edge triplets of a given subgraph. In order to do +# this, we prepare a unique indexed node df and edge df for the knowledge +# graph now. 
This process occurs trivially with the LargeGraphIndexer: + +# %% + +# %% +indexer = LargeGraphIndexer.from_triplets(full_graph) + +# %% +# Node DF +textual_nodes = pd.DataFrame.from_dict( + {"node_attr": indexer.get_node_features()}) +textual_nodes["node_id"] = textual_nodes.index +textual_nodes = textual_nodes[["node_id", "node_attr"]] + +# %% [markdown] +# Notice how LargeGraphIndexer ensures that there are no duplicate indices: + +# %% +# Edge DF +textual_edges = pd.DataFrame(indexer.get_edge_features(), + columns=["src", "edge_attr", "dst"]) +textual_edges["src"] = [indexer._nodes[h] for h in textual_edges["src"]] +textual_edges["dst"] = [indexer._nodes[h] for h in textual_edges["dst"]] + +# %% [markdown] +# Note: The edge table refers to each node by its index in the node table. +# We will see how this gets utilized later when indexing a subgraph. + +# %% [markdown] +# Now we can save the result + +# %% +textual_nodes.to_csv('wikimultihopqa_textual_nodes.csv', index=False) +textual_edges.to_csv('wikimultihopqa_textual_edges.csv', index=False) diff --git a/examples/llm/multihop_rag/rag_generate_multihop.py b/examples/llm/multihop_rag/rag_generate_multihop.py new file mode 100644 index 000000000000..de93a9e75dd1 --- /dev/null +++ b/examples/llm/multihop_rag/rag_generate_multihop.py @@ -0,0 +1,88 @@ +# %% +import argparse +import sys +from typing import Tuple + +import pandas as pd +import torch +import tqdm + +from torch_geometric.data import Data +from torch_geometric.datasets.web_qsp_dataset import ( + preprocess_triplet, + retrieval_via_pcst, +) +from torch_geometric.loader import RAGQueryLoader +from torch_geometric.nn.nlp import SentenceTransformer + +sys.path.append('..') + +from g_retriever_utils.rag_backend_utils import \ + create_remote_backend_from_triplets # noqa: E402 +from g_retriever_utils.rag_feature_store import \ + SentenceTransformerApproxFeatureStore # noqa: E402 +from g_retriever_utils.rag_graph_store import \ + NeighborSamplingRAGGraphStore # noqa: E402 + +# %% +parser = argparse.ArgumentParser( + description="Generate new multihop dataset for rag") +# TODO: Add more arguments for configuring rag params +parser.add_argument("--num_samples", type=int) +args = parser.parse_args() + +# %% +triplets = torch.load('wikimultihopqa_full_graph.pt') + +# %% +df = pd.read_csv('wikimultihopqa_cleaned.csv') +questions = df['question'][:args.num_samples] +labels = df['answer'][:args.num_samples] + +# %% +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +model = SentenceTransformer( + model_name='sentence-transformers/all-roberta-large-v1').to(device) + +# %% +fs, gs = create_remote_backend_from_triplets( + triplets=triplets, node_embedding_model=model, + node_method_to_call="encode", path="backend", + pre_transform=preprocess_triplet, node_method_kwargs={ + "batch_size": 256 + }, graph_db=NeighborSamplingRAGGraphStore, + feature_db=SentenceTransformerApproxFeatureStore).load() + +# %% + +all_textual_nodes = pd.read_csv('wikimultihopqa_textual_nodes.csv') +all_textual_edges = pd.read_csv('wikimultihopqa_textual_edges.csv') + + +def apply_retrieval_via_pcst(graph: Data, query: str, topk: int = 3, + topk_e: int = 3, + cost_e: float = 0.5) -> Tuple[Data, str]: + q_emb = model.encode(query) + textual_nodes = all_textual_nodes.iloc[graph["node_idx"]].reset_index() + textual_edges = all_textual_edges.iloc[graph["edge_idx"]].reset_index() + out_graph, desc = retrieval_via_pcst(graph, q_emb, textual_nodes, + textual_edges, topk, topk_e, cost_e) + out_graph["desc"] = 
desc + return out_graph + + +# %% +query_loader = RAGQueryLoader(data=(fs, gs), seed_nodes_kwargs={"k_nodes": 10}, + seed_edges_kwargs={"k_edges": 10}, + sampler_kwargs={"num_neighbors": [40] * 3}, + local_filter=apply_retrieval_via_pcst) + +# %% +subgs = [] +for q, l in tqdm.tqdm(zip(questions, labels)): + subg = query_loader.query(q) + subg['question'] = q + subg['label'] = l + subgs.append(subg) + +torch.save(subgs, 'subg_results.pt') diff --git a/examples/llm/nvtx_examples/README.md b/examples/llm/nvtx_examples/README.md new file mode 100644 index 000000000000..aa4f070d9824 --- /dev/null +++ b/examples/llm/nvtx_examples/README.md @@ -0,0 +1,7 @@ +# Examples for LLM and GNN co-training + +| Example | Description | +| -------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------- | +| [`nvtx_run.sh`](./nvtx_run.sh) | Runs nsys profiler on a given Python file that contains NVTX calls. | +| [`nvtx_rag_backend_example.py`](./nvtx_rag_backend_example.py) | Example script for nsys profiling a RAG Backend such as that used in [`rag_generate.py`](../g_retriever_utils/rag_generate.py). | +| [`nvtx_webqsp_example.py`](./nvtx_webqsp_example.py) | Example script for nsys profiling the WebQSP dataset. | diff --git a/examples/llm/nvtx_examples/nvtx_rag_backend_example.py b/examples/llm/nvtx_examples/nvtx_rag_backend_example.py new file mode 100644 index 000000000000..b30e34b8c7b1 --- /dev/null +++ b/examples/llm/nvtx_examples/nvtx_rag_backend_example.py @@ -0,0 +1,144 @@ +# %% +import argparse +import sys +from itertools import chain +from typing import Tuple + +import torch + +from torch_geometric.data import Data, get_features_for_triplets_groups +from torch_geometric.datasets import WebQSPDataset +from torch_geometric.datasets.web_qsp_dataset import ( + preprocess_triplet, + retrieval_via_pcst, +) +from torch_geometric.loader import rag_loader +from torch_geometric.nn.nlp import SentenceTransformer +from torch_geometric.profile.nvtx import nvtxit + +sys.path.append('..') +from g_retriever_utils.rag_backend_utils import \ + create_remote_backend_from_triplets # noqa: E402 +from g_retriever_utils.rag_feature_store import \ + SentenceTransformerFeatureStore # noqa: E402 +from g_retriever_utils.rag_graph_store import \ + NeighborSamplingRAGGraphStore # noqa: E402 + +# %% +# Patch FeatureStore and GraphStore + +SentenceTransformerFeatureStore.retrieve_seed_nodes = nvtxit()( + SentenceTransformerFeatureStore.retrieve_seed_nodes) +SentenceTransformerFeatureStore.retrieve_seed_edges = nvtxit()( + SentenceTransformerFeatureStore.retrieve_seed_edges) +SentenceTransformerFeatureStore.load_subgraph = nvtxit()( + SentenceTransformerFeatureStore.load_subgraph) +NeighborSamplingRAGGraphStore.sample_subgraph = nvtxit()( + NeighborSamplingRAGGraphStore.sample_subgraph) +rag_loader.RAGQueryLoader.query = nvtxit()(rag_loader.RAGQueryLoader.query) + +# %% +ds = WebQSPDataset("small_ds_1", force_reload=True, limit=10) + +# %% +triplets = list(chain.from_iterable(d['graph'] for d in ds.raw_dataset)) + +# %% +questions = ds.raw_dataset['question'] + +# %% +ground_truth_graphs = get_features_for_triplets_groups( + ds.indexer, (d['graph'] for d in ds.raw_dataset), + pre_transform=preprocess_triplet) +num_edges = len(ds.indexer._edges) + +# %% +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +model = SentenceTransformer('sentence-transformers/all-roberta-large-v1').to( + 
    device)
+
+# %%
+fs, gs = create_remote_backend_from_triplets(
+    triplets=triplets, node_embedding_model=model,
+    node_method_to_call="encode", path="backend",
+    pre_transform=preprocess_triplet, node_method_kwargs={
+        "batch_size": 256
+    }, graph_db=NeighborSamplingRAGGraphStore,
+    feature_db=SentenceTransformerFeatureStore).load()
+
+# %%
+
+
+@nvtxit()
+def apply_retrieval_via_pcst(graph: Data, query: str, topk: int = 3,
+                             topk_e: int = 3,
+                             cost_e: float = 0.5) -> Tuple[Data, str]:
+    q_emb = model.encode(query)
+    textual_nodes = ds.textual_nodes.iloc[graph["node_idx"]].reset_index()
+    textual_edges = ds.textual_edges.iloc[graph["edge_idx"]].reset_index()
+    out_graph, desc = retrieval_via_pcst(graph, q_emb, textual_nodes,
+                                         textual_edges, topk, topk_e, cost_e)
+    out_graph["desc"] = desc
+    return out_graph
+
+
+# %%
+query_loader = rag_loader.RAGQueryLoader(
+    data=(fs, gs), seed_nodes_kwargs={"k_nodes": 10},
+    seed_edges_kwargs={"k_edges": 10},
+    sampler_kwargs={"num_neighbors": [40] * 10},
+    local_filter=apply_retrieval_via_pcst)
+
+
+# %%
+# Accuracy Metrics to be added to Profiler
+def _eidx_helper(subg: Data, ground_truth: Data):
+    subg_eidx, gt_eidx = subg.edge_idx, ground_truth.edge_idx
+    if isinstance(subg_eidx, torch.Tensor):
+        subg_eidx = subg_eidx.tolist()
+    if isinstance(gt_eidx, torch.Tensor):
+        gt_eidx = gt_eidx.tolist()
+    subg_e = set(subg_eidx)
+    gt_e = set(gt_eidx)
+    return subg_e, gt_e
+
+
+def check_retrieval_accuracy(subg: Data, ground_truth: Data, num_edges: int):
+    subg_e, gt_e = _eidx_helper(subg, ground_truth)
+    total_e = set(range(num_edges))
+    tp = len(subg_e & gt_e)
+    tn = len(total_e - (subg_e | gt_e))
+    return (tp + tn) / num_edges
+
+
+def check_retrieval_precision(subg: Data, ground_truth: Data):
+    subg_e, gt_e = _eidx_helper(subg, ground_truth)
+    return len(subg_e & gt_e) / len(subg_e)
+
+
+def check_retrieval_recall(subg: Data, ground_truth: Data):
+    subg_e, gt_e = _eidx_helper(subg, ground_truth)
+    return len(subg_e & gt_e) / len(gt_e)
+
+
+# %%
+
+
+@nvtxit()
+def _run_eval():
+    for subg, gt in zip((query_loader.query(q) for q in questions),
+                        ground_truth_graphs):
+        print(check_retrieval_accuracy(subg, gt, num_edges),
+              check_retrieval_precision(subg, gt),
+              check_retrieval_recall(subg, gt))
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--capture-torch-kernels", "-k", action="store_true")
+    args = parser.parse_args()
+    if args.capture_torch_kernels:
+        with torch.autograd.profiler.emit_nvtx():
+            _run_eval()
+    else:
+        _run_eval()
diff --git a/examples/llm/nvtx_examples/nvtx_run.sh b/examples/llm/nvtx_examples/nvtx_run.sh
new file mode 100644
index 000000000000..4c6fce7c8224
--- /dev/null
+++ b/examples/llm/nvtx_examples/nvtx_run.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+# Check if the user provided a Python file
+if [ -z "$1" ]; then
+    echo "Usage: $0 <python_file>"
+    exit 1
+fi
+
+# Check if the provided file exists
+if [[ ! -f "$1" ]]; then
+    echo "Error: File '$1' does not exist."
+    exit 1
+fi
+
+# Check if the provided file is a Python file
+if [[ ! "$1" == *.py ]]; then
+    echo "Error: '$1' is not a Python file."
+ exit 1 +fi + +# Get the base name of the Python file +python_file=$(basename "$1") + +# Run nsys profile on the Python file +nsys profile -c cudaProfilerApi --capture-range-end repeat -t cuda,nvtx,osrt,cudnn,cublas --cuda-memory-usage true --cudabacktrace all --force-overwrite true --output=profile_${python_file%.py} python "$1" + +echo "Profile data saved as profile_${python_file%.py}.nsys-rep" diff --git a/examples/llm/nvtx_examples/nvtx_webqsp_example.py b/examples/llm/nvtx_examples/nvtx_webqsp_example.py new file mode 100644 index 000000000000..5a9aad27f1c0 --- /dev/null +++ b/examples/llm/nvtx_examples/nvtx_webqsp_example.py @@ -0,0 +1,22 @@ +import argparse + +import torch + +from torch_geometric.datasets import web_qsp_dataset +from torch_geometric.profile import nvtxit + +# Apply Patches +web_qsp_dataset.retrieval_via_pcst = nvtxit()( + web_qsp_dataset.retrieval_via_pcst) +web_qsp_dataset.WebQSPDataset.process = nvtxit()( + web_qsp_dataset.WebQSPDataset.process) + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--capture-torch-kernels", "-k", action="store_true") + args = parser.parse_args() + if args.capture_torch_kernels: + with torch.autograd.profiler.emit_nvtx(): + ds = web_qsp_dataset.WebQSPDataset('baseline', split='val') + else: + ds = web_qsp_dataset.WebQSPDataset('baseline', split='val') diff --git a/test/data/test_large_graph_indexer.py b/test/data/test_large_graph_indexer.py new file mode 100644 index 000000000000..b98fe7d7ddbf --- /dev/null +++ b/test/data/test_large_graph_indexer.py @@ -0,0 +1,177 @@ +import random +import string +from typing import List + +import pytest +import torch + +from torch_geometric.data import ( + Data, + LargeGraphIndexer, + TripletLike, + get_features_for_triplets, +) +from torch_geometric.data.large_graph_indexer import ( + EDGE_PID, + EDGE_RELATION, + NODE_PID, +) +from torch_geometric.typing import WITH_PT20 + +# create possible nodes and edges for graph +strkeys = string.ascii_letters + string.digits +NODE_POOL = list({"".join(random.sample(strkeys, 10)) for i in range(1000)}) +EDGE_POOL = list({"".join(random.sample(strkeys, 10)) for i in range(50)}) + + +def featurize(s: str) -> int: + return int.from_bytes(s.encode(), 'little') + + +def sample_triplets(amount: int = 1) -> List[TripletLike]: + trips = [] + for i in range(amount): + h, t = random.sample(NODE_POOL, k=2) + r = random.sample(EDGE_POOL, k=1)[0] + trips.append(tuple([h, r, t])) + return trips + + +def preprocess_triplet(triplet: TripletLike) -> TripletLike: + h, r, t = triplet + return h.lower(), r, t.lower() + + +def test_basic_collate(): + graphs = [sample_triplets(1000) for i in range(2)] + + indexer_0 = LargeGraphIndexer.from_triplets( + graphs[0], pre_transform=preprocess_triplet) + indexer_1 = LargeGraphIndexer.from_triplets( + graphs[1], pre_transform=preprocess_triplet) + + big_indexer = LargeGraphIndexer.collate([indexer_0, indexer_1]) + + assert len(indexer_0._nodes) + len( + indexer_1._nodes) - len(indexer_0._nodes.keys() + & indexer_1._nodes.keys()) == len( + big_indexer._nodes) + assert len(indexer_0._edges) + len( + indexer_1._edges) - len(indexer_0._edges.keys() + & indexer_1._edges.keys()) == len( + big_indexer._edges) + + assert len(set(big_indexer._nodes.values())) == len(big_indexer._nodes) + assert len(set(big_indexer._edges.values())) == len(big_indexer._edges) + + for node in (indexer_0._nodes.keys() | indexer_1._nodes.keys()): + assert big_indexer.node_attr[NODE_PID][ + big_indexer._nodes[node]] == node + + 
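+
+# The test below cross-checks LargeGraphIndexer against a naive per-graph
+# encoding: each sampled triplet list is featurized twice (once directly and
+# once through a collated indexer) and the resulting Data objects are
+# compared feature-by-feature.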
+def test_large_graph_index(): + graphs = [sample_triplets(1000) for i in range(100)] + + # Preprocessing of trips lowercases nodes but not edges + node_feature_vecs = {s.lower(): featurize(s.lower()) for s in NODE_POOL} + edge_feature_vecs = {s: featurize(s) for s in EDGE_POOL} + + def encode_graph_from_trips(triplets: List[TripletLike]) -> Data: + seen_nodes = dict() + edge_attrs = list() + edge_idx = [] + for trip in triplets: + trip = preprocess_triplet(trip) + h, r, t = trip + seen_nodes[h] = len( + seen_nodes) if h not in seen_nodes else seen_nodes[h] + seen_nodes[t] = len( + seen_nodes) if t not in seen_nodes else seen_nodes[t] + edge_attrs.append(edge_feature_vecs[r]) + edge_idx.append((seen_nodes[h], seen_nodes[t])) + + x = torch.Tensor([node_feature_vecs[n] for n in seen_nodes.keys()]) + edge_idx = torch.LongTensor(edge_idx).T + edge_attrs = torch.Tensor(edge_attrs) + return Data(x=x, edge_index=edge_idx, edge_attr=edge_attrs) + + naive_graph_ds = [ + encode_graph_from_trips(triplets=trips) for trips in graphs + ] + + indexer = LargeGraphIndexer.collate([ + LargeGraphIndexer.from_triplets(g, pre_transform=preprocess_triplet) + for g in graphs + ]) + indexer_nodes = indexer.get_unique_node_features() + indexer_node_vals = torch.Tensor( + [node_feature_vecs[n] for n in indexer_nodes]) + indexer_edges = indexer.get_unique_edge_features( + feature_name=EDGE_RELATION) + indexer_edge_vals = torch.Tensor( + [edge_feature_vecs[e] for e in indexer_edges]) + indexer.add_node_feature('x', indexer_node_vals) + indexer.add_edge_feature('edge_attr', indexer_edge_vals, + map_from_feature=EDGE_RELATION) + large_graph_ds = [ + get_features_for_triplets(indexer=indexer, triplets=g, + node_feature_name='x', + edge_feature_name='edge_attr', + pre_transform=preprocess_triplet) + for g in graphs + ] + + for ds in large_graph_ds: + assert NODE_PID in ds + assert EDGE_PID in ds + assert "node_idx" in ds + assert "edge_idx" in ds + + def results_are_close_enough(ground_truth: Data, new_method: Data, + thresh=.99): + def _sorted_tensors_are_close(tensor1, tensor2): + return torch.all( + torch.isclose(tensor1.sort()[0], + tensor2.sort()[0]) > thresh) + + def _graphs_are_same(tensor1, tensor2): + if not WITH_PT20: + pytest.skip( + "This test requires a PyG version with NetworkX as a " + + "dependency.") + import networkx as nx + return nx.weisfeiler_lehman_graph_hash(nx.Graph( + tensor1.T)) == nx.weisfeiler_lehman_graph_hash( + nx.Graph(tensor2.T)) + return True + return _sorted_tensors_are_close( + ground_truth.x, new_method.x) \ + and _sorted_tensors_are_close( + ground_truth.edge_attr, new_method.edge_attr) \ + and _graphs_are_same( + ground_truth.edge_index, new_method.edge_index) + + for dsets in zip(naive_graph_ds, large_graph_ds): + assert results_are_close_enough(*dsets) + + +def test_save_load(tmp_path): + graph = sample_triplets(1000) + + node_feature_vecs = {s: featurize(s) for s in NODE_POOL} + edge_feature_vecs = {s: featurize(s) for s in EDGE_POOL} + + indexer = LargeGraphIndexer.from_triplets(graph) + indexer_nodes = indexer.get_unique_node_features() + indexer_node_vals = torch.Tensor( + [node_feature_vecs[n] for n in indexer_nodes]) + indexer_edges = indexer.get_unique_edge_features( + feature_name=EDGE_RELATION) + indexer_edge_vals = torch.Tensor( + [edge_feature_vecs[e] for e in indexer_edges]) + indexer.add_node_feature('x', indexer_node_vals) + indexer.add_edge_feature('edge_attr', indexer_edge_vals, + map_from_feature=EDGE_RELATION) + + indexer.save(str(tmp_path)) + assert indexer == 
LargeGraphIndexer.from_disk(str(tmp_path)) diff --git a/test/nn/models/test_g_retriever.py b/test/nn/models/test_g_retriever.py index 899e70730cc9..24a74d1b6f6e 100644 --- a/test/nn/models/test_g_retriever.py +++ b/test/nn/models/test_g_retriever.py @@ -51,3 +51,52 @@ def test_g_retriever() -> None: # Test inference: pred = model.inference(question, x, edge_index, batch, edge_attr) assert len(pred) == 1 + + +@onlyFullTest +@withPackage('transformers', 'sentencepiece', 'accelerate') +def test_g_retriever_many_tokens() -> None: + llm = LLM( + model_name='TinyLlama/TinyLlama-1.1B-Chat-v0.1', + num_params=1, + dtype=torch.float16, + ) + + gnn = GAT( + in_channels=1024, + out_channels=1024, + hidden_channels=1024, + num_layers=2, + heads=4, + norm='batch_norm', + ) + + model = GRetriever( + llm=llm, + gnn=gnn, + mlp_out_channels=2048, + mlp_out_tokens=2, + ) + assert str(model) == ('GRetriever(\n' + ' llm=LLM(TinyLlama/TinyLlama-1.1B-Chat-v0.1),\n' + ' gnn=GAT(1024, 1024, num_layers=2),\n' + ')') + + x = torch.randn(10, 1024) + edge_index = torch.tensor([ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], + [1, 2, 3, 4, 5, 6, 7, 8, 9, 0], + ]) + edge_attr = torch.randn(edge_index.size(1), 1024) + batch = torch.zeros(x.size(0), dtype=torch.long) + + question = ["Is PyG the best open-source GNN library?"] + label = ["yes!"] + + # Test train: + loss = model(question, x, edge_index, batch, label, edge_attr) + assert loss >= 0 + + # Test inference: + pred = model.inference(question, x, edge_index, batch, edge_attr) + assert len(pred) == 1 diff --git a/test/profile/test_nvtx.py b/test/profile/test_nvtx.py new file mode 100644 index 000000000000..56e28a9c2e59 --- /dev/null +++ b/test/profile/test_nvtx.py @@ -0,0 +1,136 @@ +from unittest.mock import call, patch + +from torch_geometric.profile import nvtxit + + +def _setup_mock(torch_cuda_mock): + torch_cuda_mock.is_available.return_value = True + torch_cuda_mock.cudart.return_value.cudaProfilerStart.return_value = None + torch_cuda_mock.cudart.return_value.cudaProfilerStop.return_value = None + return torch_cuda_mock + + +@patch('torch_geometric.profile.nvtx.torch.cuda') +def test_nvtxit_base(torch_cuda_mock): + torch_cuda_mock = _setup_mock(torch_cuda_mock) + + # dummy func calls a calls b + + @nvtxit() + def call_b(): + assert torch_cuda_mock.cudart.return_value.cudaProfilerStart.call_count == 1 # noqa: E501 + assert torch_cuda_mock.cudart.return_value.cudaProfilerStop.call_count == 0 # noqa: E501 + return 42 + + @nvtxit() + def call_a(): + assert torch_cuda_mock.cudart.return_value.cudaProfilerStart.call_count == 1 # noqa: E501 + assert torch_cuda_mock.cudart.return_value.cudaProfilerStop.call_count == 0 # noqa: E501 + return call_b() + + def dummy_func(): + assert torch_cuda_mock.cudart.return_value.cudaProfilerStart.call_count == 0 # noqa: E501 + assert torch_cuda_mock.cudart.return_value.cudaProfilerStop.call_count == 0 # noqa: E501 + return call_a() + + assert torch_cuda_mock.cudart.return_value.cudaProfilerStart.call_count == 0 # noqa: E501 + assert torch_cuda_mock.cudart.return_value.cudaProfilerStop.call_count == 0 # noqa: E501 + dummy_func() + + assert torch_cuda_mock.cudart.return_value.cudaProfilerStart.call_count == 1 # noqa: E501 + assert torch_cuda_mock.cudart.return_value.cudaProfilerStop.call_count == 1 # noqa: E501 + assert torch_cuda_mock.nvtx.range_push.call_args_list == [ + call('call_a_0'), call('call_b_0') + ] + + +@patch('torch_geometric.profile.nvtx.torch.cuda') +def test_nvtxit_rename(torch_cuda_mock): + torch_cuda_mock = 
_setup_mock(torch_cuda_mock) + + # dummy func calls a calls b + + @nvtxit() + def call_b(): + assert torch_cuda_mock.cudart.return_value.cudaProfilerStart.call_count == 1 # noqa: E501 + assert torch_cuda_mock.cudart.return_value.cudaProfilerStop.call_count == 0 # noqa: E501 + return 42 + + @nvtxit('a_nvtx') + def call_a(): + assert torch_cuda_mock.cudart.return_value.cudaProfilerStart.call_count == 1 # noqa: E501 + assert torch_cuda_mock.cudart.return_value.cudaProfilerStop.call_count == 0 # noqa: E501 + return call_b() + + def dummy_func(): + assert torch_cuda_mock.cudart.return_value.cudaProfilerStart.call_count == 0 # noqa: E501 + assert torch_cuda_mock.cudart.return_value.cudaProfilerStop.call_count == 0 # noqa: E501 + return call_a() + + assert torch_cuda_mock.cudart.return_value.cudaProfilerStart.call_count == 0 # noqa: E501 + assert torch_cuda_mock.cudart.return_value.cudaProfilerStop.call_count == 0 # noqa: E501 + dummy_func() + + assert torch_cuda_mock.cudart.return_value.cudaProfilerStart.call_count == 1 # noqa: E501 + assert torch_cuda_mock.cudart.return_value.cudaProfilerStop.call_count == 1 # noqa: E501 + assert torch_cuda_mock.nvtx.range_push.call_args_list == [ + call('a_nvtx_0'), call('call_b_0') + ] + + +@patch('torch_geometric.profile.nvtx.torch.cuda') +def test_nvtxit_iters(torch_cuda_mock): + torch_cuda_mock = _setup_mock(torch_cuda_mock) + + # dummy func calls a calls b + + @nvtxit(n_iters=1) + def call_b(): + return 42 + + @nvtxit() + def call_a(): + return call_b() + + assert torch_cuda_mock.cudart.return_value.cudaProfilerStart.call_count == 0 # noqa: E501 + assert torch_cuda_mock.cudart.return_value.cudaProfilerStop.call_count == 0 # noqa: E501 + + call_b() + assert torch_cuda_mock.cudart.return_value.cudaProfilerStart.call_count == 1 # noqa: E501 + assert torch_cuda_mock.cudart.return_value.cudaProfilerStop.call_count == 1 # noqa: E501 + call_a() + assert torch_cuda_mock.cudart.return_value.cudaProfilerStart.call_count == 2 # noqa: E501 + assert torch_cuda_mock.cudart.return_value.cudaProfilerStop.call_count == 2 # noqa: E501 + + assert torch_cuda_mock.nvtx.range_push.call_args_list == [ + call('call_b_0'), call('call_a_0') + ] + + +@patch('torch_geometric.profile.nvtx.torch.cuda') +def test_nvtxit_warmups(torch_cuda_mock): + torch_cuda_mock = _setup_mock(torch_cuda_mock) + + # dummy func calls a calls b + + @nvtxit(n_warmups=1) + def call_b(): + return 42 + + @nvtxit() + def call_a(): + return call_b() + + assert torch_cuda_mock.cudart.return_value.cudaProfilerStart.call_count == 0 # noqa: E501 + assert torch_cuda_mock.cudart.return_value.cudaProfilerStop.call_count == 0 # noqa: E501 + + call_b() + assert torch_cuda_mock.cudart.return_value.cudaProfilerStart.call_count == 0 # noqa: E501 + assert torch_cuda_mock.cudart.return_value.cudaProfilerStop.call_count == 0 # noqa: E501 + call_a() + assert torch_cuda_mock.cudart.return_value.cudaProfilerStart.call_count == 1 # noqa: E501 + assert torch_cuda_mock.cudart.return_value.cudaProfilerStop.call_count == 1 # noqa: E501 + + assert torch_cuda_mock.nvtx.range_push.call_args_list == [ + call('call_a_0'), call('call_b_1') + ] diff --git a/torch_geometric/data/__init__.py b/torch_geometric/data/__init__.py index 821ef9c5c063..fee215b1a357 100644 --- a/torch_geometric/data/__init__.py +++ b/torch_geometric/data/__init__.py @@ -16,6 +16,7 @@ from .makedirs import makedirs from .download import download_url, download_google_url from .extract import extract_tar, extract_zip, extract_bz2, extract_gz +from .large_graph_indexer 
import LargeGraphIndexer, TripletLike, get_features_for_triplets, get_features_for_triplets_groups
 from torch_geometric.lazy_loader import LazyLoader

@@ -27,6 +28,8 @@
     'Dataset',
     'InMemoryDataset',
     'OnDiskDataset',
+    'LargeGraphIndexer',
+    'TripletLike',
 ]

 remote_backend_classes = [
@@ -50,6 +53,8 @@
     'extract_zip',
     'extract_bz2',
     'extract_gz',
+    'get_features_for_triplets',
+    'get_features_for_triplets_groups',
 ]

 __all__ = data_classes + remote_backend_classes + helper_functions
diff --git a/torch_geometric/data/large_graph_indexer.py b/torch_geometric/data/large_graph_indexer.py
new file mode 100644
index 000000000000..0644e2543303
--- /dev/null
+++ b/torch_geometric/data/large_graph_indexer.py
@@ -0,0 +1,677 @@
+import os
+import pickle as pkl
+import shutil
+from dataclasses import dataclass
+from itertools import chain
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    Hashable,
+    Iterable,
+    Iterator,
+    List,
+    Optional,
+    Sequence,
+    Set,
+    Tuple,
+    Union,
+)
+
+import torch
+from torch import Tensor
+from tqdm import tqdm
+
+from torch_geometric.data import Data
+from torch_geometric.typing import WITH_PT24
+
+TripletLike = Tuple[Hashable, Hashable, Hashable]
+
+KnowledgeGraphLike = Iterable[TripletLike]
+
+
+def ordered_set(values: Iterable[Hashable]) -> List[Hashable]:
+    return list(dict.fromkeys(values))
+
+
+# TODO: Refactor Node and Edge funcs and attrs to be accessible via an Enum?
+
+NODE_PID = "pid"
+
+NODE_KEYS = {NODE_PID}
+
+EDGE_PID = "e_pid"
+EDGE_HEAD = "h"
+EDGE_RELATION = "r"
+EDGE_TAIL = "t"
+EDGE_INDEX = "edge_idx"
+
+EDGE_KEYS = {EDGE_PID, EDGE_HEAD, EDGE_RELATION, EDGE_TAIL, EDGE_INDEX}
+
+FeatureValueType = Union[Sequence[Any], Tensor]
+
+
+@dataclass
+class MappedFeature:
+    name: str
+    values: FeatureValueType
+
+    def __eq__(self, value: "MappedFeature") -> bool:
+        eq = self.name == value.name
+        if isinstance(self.values, torch.Tensor):
+            eq &= torch.equal(self.values, value.values)
+        else:
+            eq &= self.values == value.values
+        return eq
+
+
+if WITH_PT24:
+    torch.serialization.add_safe_globals([MappedFeature])
+
+
+class LargeGraphIndexer:
+    """For a dataset that consists of multiple subgraphs that are assumed to
+    be part of a much larger graph, collate the values into a large graph
+    store to save resources.
+    """
+    def __init__(
+        self,
+        nodes: Iterable[Hashable],
+        edges: KnowledgeGraphLike,
+        node_attr: Optional[Dict[str, List[Any]]] = None,
+        edge_attr: Optional[Dict[str, List[Any]]] = None,
+    ) -> None:
+        r"""Constructs a new index that uniquely catalogs each node and edge
+        by id. Not meant to be used directly.
+
+        Args:
+            nodes (Iterable[Hashable]): Node ids in the graph.
+            edges (KnowledgeGraphLike): Edge ids in the graph.
+            node_attr (Optional[Dict[str, List[Any]]], optional): Mapping of
+                node attribute names to lists of their values in order of
+                unique node ids. Defaults to None.
+            edge_attr (Optional[Dict[str, List[Any]]], optional): Mapping of
+                edge attribute names to lists of their values in order of
+                unique edge ids. Defaults to None.
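+
+        Example (an illustrative sketch; in practice, prefer the
+        :meth:`from_triplets` constructor):
+
+        .. code-block:: python
+
+            nodes = ["a", "b"]
+            edges = [("a", "to", "b")]
+            indexer = LargeGraphIndexer(nodes, edges)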
+        """
+        self._nodes: Dict[Hashable, int] = dict()
+        self._edges: Dict[TripletLike, int] = dict()
+
+        self._mapped_node_features: Set[str] = set()
+        self._mapped_edge_features: Set[str] = set()
+
+        if len(nodes) != len(set(nodes)):
+            raise AttributeError("Nodes need to be unique")
+        if len(edges) != len(set(edges)):
+            raise AttributeError("Edges need to be unique")
+
+        if node_attr is not None:
+            # TODO: Validity checks btw nodes and node_attr
+            self.node_attr = node_attr
+            if NODE_KEYS & set(self.node_attr.keys()) != NODE_KEYS:
+                raise AttributeError(
+                    "Invalid node_attr object. Missing " +
+                    f"{NODE_KEYS - set(self.node_attr.keys())}")
+            elif self.node_attr[NODE_PID] != nodes:
+                raise AttributeError(
+                    "Nodes provided do not match those in node_attr")
+        else:
+            self.node_attr = dict()
+            self.node_attr[NODE_PID] = nodes
+
+        for i, node in enumerate(self.node_attr[NODE_PID]):
+            self._nodes[node] = i
+
+        if edge_attr is not None:
+            # TODO: Validity checks btw edges and edge_attr
+            self.edge_attr = edge_attr
+
+            if EDGE_KEYS & set(self.edge_attr.keys()) != EDGE_KEYS:
+                raise AttributeError(
+                    "Invalid edge_attr object. Missing " +
+                    f"{EDGE_KEYS - set(self.edge_attr.keys())}")
+            elif self.edge_attr[EDGE_PID] != edges:
+                raise AttributeError(
+                    "Edges provided do not match those in edge_attr")
+
+        else:
+            self.edge_attr = dict()
+            for default_key in EDGE_KEYS:
+                self.edge_attr[default_key] = list()
+            self.edge_attr[EDGE_PID] = edges
+
+            for i, tup in enumerate(edges):
+                h, r, t = tup
+                self.edge_attr[EDGE_HEAD].append(h)
+                self.edge_attr[EDGE_RELATION].append(r)
+                self.edge_attr[EDGE_TAIL].append(t)
+                self.edge_attr[EDGE_INDEX].append(
+                    (self._nodes[h], self._nodes[t]))
+
+        for i, tup in enumerate(edges):
+            self._edges[tup] = i
+
+    @classmethod
+    def from_triplets(
+        cls,
+        triplets: KnowledgeGraphLike,
+        pre_transform: Optional[Callable[[TripletLike], TripletLike]] = None,
+    ) -> "LargeGraphIndexer":
+        r"""Generate a new index from a series of triplets that represent edge
+        relations between nodes. Each triplet is formatted as
+        (source_node, edge, dest_node).
+
+        Args:
+            triplets (KnowledgeGraphLike): Series of triplets representing
+                knowledge graph relations.
+            pre_transform (Optional[Callable[[TripletLike], TripletLike]]):
+                Optional preprocessing function to apply to triplets.
+                Defaults to None.
+
+        Returns:
+            LargeGraphIndexer: Index of unique nodes and edges.
+        """
+        # NOTE: Right now assumes that all trips can be loaded into memory
+        nodes = set()
+        edges = set()
+
+        if pre_transform is not None:

+            def apply_transform(
+                    trips: KnowledgeGraphLike) -> Iterator[TripletLike]:
+                for trip in trips:
+                    yield pre_transform(trip)
+
+            triplets = apply_transform(triplets)
+
+        for h, r, t in triplets:
+
+            for node in (h, t):
+                nodes.add(node)
+
+            edge_idx = (h, r, t)
+            edges.add(edge_idx)
+
+        return cls(list(nodes), list(edges))
+
+    @classmethod
+    def collate(cls,
+                graphs: Iterable["LargeGraphIndexer"]) -> "LargeGraphIndexer":
+        r"""Combines a series of large graph indexes into a single large graph
+        index.
+
+        Args:
+            graphs (Iterable["LargeGraphIndexer"]): Indices to be
+                combined.
+
+        Returns:
+            LargeGraphIndexer: Singular unique index for all nodes and edges
+                in input indices.
+        """
+        # FIXME Needs to merge node attrs and edge attrs?
+        trips = chain.from_iterable([graph.to_triplets() for graph in graphs])
+        return cls.from_triplets(trips)
+
+    def get_unique_node_features(
+            self, feature_name: str = NODE_PID) -> List[Hashable]:
+        r"""Get all the unique values for a specific node attribute.
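+
+        Mapped features (those added via :meth:`add_node_feature` with a
+        non-default :obj:`map_from_feature`) cannot be retrieved uniquely
+        and raise an :class:`IndexError` instead.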
+
+        Args:
+            feature_name (str, optional): Name of feature to get.
+                Defaults to NODE_PID.
+
+        Returns:
+            List[Hashable]: List of unique values for the specified feature.
+        """
+        try:
+            if feature_name in self._mapped_node_features:
+                raise IndexError(
+                    "Only non-mapped features can be retrieved uniquely.")
+            return ordered_set(self.get_node_features(feature_name))
+
+        except KeyError:
+            raise AttributeError(
+                f"Nodes do not have a feature called {feature_name}")
+
+    def add_node_feature(
+        self,
+        new_feature_name: str,
+        new_feature_vals: FeatureValueType,
+        map_from_feature: str = NODE_PID,
+    ) -> None:
+        r"""Adds a new feature that corresponds to each unique node in
+        the graph.
+
+        Args:
+            new_feature_name (str): Name to call the new feature.
+            new_feature_vals (FeatureValueType): Values to map for that
+                new feature.
+            map_from_feature (str, optional): Key of feature to map from.
+                Size must match the number of feature values.
+                Defaults to NODE_PID.
+        """
+        if new_feature_name in self.node_attr:
+            raise AttributeError("Features cannot be overridden once created")
+        if map_from_feature in self._mapped_node_features:
+            raise AttributeError(
+                f"{map_from_feature} is already a feature mapping.")
+
+        feature_keys = self.get_unique_node_features(map_from_feature)
+        if len(feature_keys) != len(new_feature_vals):
+            raise AttributeError(
+                f"Expected encodings for {len(feature_keys)} unique features,"
+                + f" but got {len(new_feature_vals)} encodings.")
+
+        if map_from_feature == NODE_PID:
+            self.node_attr[new_feature_name] = new_feature_vals
+        else:
+            self.node_attr[new_feature_name] = MappedFeature(
+                name=map_from_feature, values=new_feature_vals)
+            self._mapped_node_features.add(new_feature_name)
+
+    def get_node_features(
+        self,
+        feature_name: str = NODE_PID,
+        pids: Optional[Iterable[Hashable]] = None,
+    ) -> List[Any]:
+        r"""Get node feature values for a given set of unique node ids.
+        Returned values are not necessarily unique.
+
+        Args:
+            feature_name (str, optional): Name of feature to fetch. Defaults
+                to NODE_PID.
+            pids (Optional[Iterable[Hashable]], optional): Node ids to fetch
+                for. Defaults to None, which fetches all nodes.
+
+        Returns:
+            List[Any]: Node features corresponding to the specified ids.
+        """
+        if feature_name in self._mapped_node_features:
+            values = self.node_attr[feature_name].values
+        else:
+            values = self.node_attr[feature_name]
+
+        # TODO: torch_geometric.utils.select
+        if isinstance(values, torch.Tensor):
+            idxs = list(
+                self.get_node_features_iter(feature_name, pids,
+                                            index_only=True))
+            return values[idxs]
+        return list(self.get_node_features_iter(feature_name, pids))
+
+    def get_node_features_iter(
+        self,
+        feature_name: str = NODE_PID,
+        pids: Optional[Iterable[Hashable]] = None,
+        index_only: bool = False,
+    ) -> Iterator[Any]:
+        """Iterator version of get_node_features. If index_only is True,
+        yields indices instead of values.
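+
+        Example (illustrative):
+
+        .. code-block:: python
+
+            # Stream raw node ids without materializing the full list:
+            for pid in indexer.get_node_features_iter():
+                print(pid)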
+        """
+        if pids is None:
+            pids = self.node_attr[NODE_PID]
+
+        if feature_name in self._mapped_node_features:
+            feature_map_info = self.node_attr[feature_name]
+            from_feature_name, to_feature_vals = (
+                feature_map_info.name,
+                feature_map_info.values,
+            )
+            from_feature_vals = self.get_unique_node_features(
+                from_feature_name)
+            feature_mapping = {k: i for i, k in enumerate(from_feature_vals)}
+
+            for pid in pids:
+                idx = self._nodes[pid]
+                from_feature_val = self.node_attr[from_feature_name][idx]
+                to_feature_idx = feature_mapping[from_feature_val]
+                if index_only:
+                    yield to_feature_idx
+                else:
+                    yield to_feature_vals[to_feature_idx]
+        else:
+            for pid in pids:
+                idx = self._nodes[pid]
+                if index_only:
+                    yield idx
+                else:
+                    yield self.node_attr[feature_name][idx]
+
+    def get_unique_edge_features(
+            self, feature_name: str = EDGE_PID) -> List[Hashable]:
+        r"""Get all the unique values for a specific edge attribute.
+
+        Args:
+            feature_name (str, optional): Name of feature to get.
+                Defaults to EDGE_PID.
+
+        Returns:
+            List[Hashable]: List of unique values for the specified feature.
+        """
+        try:
+            if feature_name in self._mapped_edge_features:
+                raise IndexError(
+                    "Only non-mapped features can be retrieved uniquely.")
+            return ordered_set(self.get_edge_features(feature_name))
+        except KeyError:
+            raise AttributeError(
+                f"Edges do not have a feature called {feature_name}")
+
+    def add_edge_feature(
+        self,
+        new_feature_name: str,
+        new_feature_vals: FeatureValueType,
+        map_from_feature: str = EDGE_PID,
+    ) -> None:
+        r"""Adds a new feature that corresponds to each unique edge in
+        the graph.
+
+        Args:
+            new_feature_name (str): Name to call the new feature.
+            new_feature_vals (FeatureValueType): Values to map for that new
+                feature.
+            map_from_feature (str, optional): Key of feature to map from.
+                Size must match the number of feature values.
+                Defaults to EDGE_PID.
+        """
+        if new_feature_name in self.edge_attr:
+            raise AttributeError("Features cannot be overridden once created")
+        if map_from_feature in self._mapped_edge_features:
+            raise AttributeError(
+                f"{map_from_feature} is already a feature mapping.")
+
+        feature_keys = self.get_unique_edge_features(map_from_feature)
+        if len(feature_keys) != len(new_feature_vals):
+            raise AttributeError(
+                f"Expected encodings for {len(feature_keys)} unique features, "
+                + f"but got {len(new_feature_vals)} encodings.")
+
+        if map_from_feature == EDGE_PID:
+            self.edge_attr[new_feature_name] = new_feature_vals
+        else:
+            self.edge_attr[new_feature_name] = MappedFeature(
+                name=map_from_feature, values=new_feature_vals)
+            self._mapped_edge_features.add(new_feature_name)
+
+    def get_edge_features(
+        self,
+        feature_name: str = EDGE_PID,
+        pids: Optional[Iterable[Hashable]] = None,
+    ) -> List[Any]:
+        r"""Get edge feature values for a given set of unique edge ids.
+        Returned values are not necessarily unique.
+
+        Args:
+            feature_name (str, optional): Name of feature to fetch.
+                Defaults to EDGE_PID.
+            pids (Optional[Iterable[Hashable]], optional): Edge ids to fetch
+                for. Defaults to None, which fetches all edges.
+
+        Returns:
+            List[Any]: Edge features corresponding to the specified ids.
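+
+        Example (an illustrative sketch; assumes an ``edge_attr`` feature
+        was previously added via :meth:`add_edge_feature`):
+
+        .. code-block:: python
+
+            feats = indexer.get_edge_features("edge_attr")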
+ """ + if feature_name in self._mapped_edge_features: + values = self.edge_attr[feature_name].values + else: + values = self.edge_attr[feature_name] + + # TODO: torch_geometric.utils.select + if isinstance(values, torch.Tensor): + idxs = list( + self.get_edge_features_iter(feature_name, pids, + index_only=True)) + return values[idxs] + return list(self.get_edge_features_iter(feature_name, pids)) + + def get_edge_features_iter( + self, + feature_name: str = EDGE_PID, + pids: Optional[KnowledgeGraphLike] = None, + index_only: bool = False, + ) -> Iterator[Any]: + """Iterator version of get_edge_features. If index_only is True, + yields indices instead of values. + """ + if pids is None: + pids = self.edge_attr[EDGE_PID] + + if feature_name in self._mapped_edge_features: + feature_map_info = self.edge_attr[feature_name] + from_feature_name, to_feature_vals = ( + feature_map_info.name, + feature_map_info.values, + ) + from_feature_vals = self.get_unique_edge_features( + from_feature_name) + feature_mapping = {k: i for i, k in enumerate(from_feature_vals)} + + for pid in pids: + idx = self._edges[pid] + from_feature_val = self.edge_attr[from_feature_name][idx] + to_feature_idx = feature_mapping[from_feature_val] + if index_only: + yield to_feature_idx + else: + yield to_feature_vals[to_feature_idx] + else: + for pid in pids: + idx = self._edges[pid] + if index_only: + yield idx + else: + yield self.edge_attr[feature_name][idx] + + def to_triplets(self) -> Iterator[TripletLike]: + return iter(self.edge_attr[EDGE_PID]) + + def save(self, path: str) -> None: + if os.path.exists(path): + shutil.rmtree(path) + os.makedirs(path, exist_ok=True) + with open(path + "/edges", "wb") as f: + pkl.dump(self._edges, f) + with open(path + "/nodes", "wb") as f: + pkl.dump(self._nodes, f) + + with open(path + "/mapped_edges", "wb") as f: + pkl.dump(self._mapped_edge_features, f) + with open(path + "/mapped_nodes", "wb") as f: + pkl.dump(self._mapped_node_features, f) + + node_attr_path = path + "/node_attr" + os.makedirs(node_attr_path, exist_ok=True) + for attr_name, vals in self.node_attr.items(): + torch.save(vals, node_attr_path + f"/{attr_name}.pt") + + edge_attr_path = path + "/edge_attr" + os.makedirs(edge_attr_path, exist_ok=True) + for attr_name, vals in self.edge_attr.items(): + torch.save(vals, edge_attr_path + f"/{attr_name}.pt") + + @classmethod + def from_disk(cls, path: str) -> "LargeGraphIndexer": + indexer = cls(list(), list()) + with open(path + "/edges", "rb") as f: + indexer._edges = pkl.load(f) + with open(path + "/nodes", "rb") as f: + indexer._nodes = pkl.load(f) + + with open(path + "/mapped_edges", "rb") as f: + indexer._mapped_edge_features = pkl.load(f) + with open(path + "/mapped_nodes", "rb") as f: + indexer._mapped_node_features = pkl.load(f) + + node_attr_path = path + "/node_attr" + for fname in os.listdir(node_attr_path): + full_fname = f"{node_attr_path}/{fname}" + key = fname.split(".")[0] + indexer.node_attr[key] = torch.load(full_fname) + + edge_attr_path = path + "/edge_attr" + for fname in os.listdir(edge_attr_path): + full_fname = f"{edge_attr_path}/{fname}" + key = fname.split(".")[0] + indexer.edge_attr[key] = torch.load(full_fname) + + return indexer + + def to_data(self, node_feature_name: str, + edge_feature_name: Optional[str] = None) -> Data: + """Return a Data object containing all the specified node and + edge features and the graph. 
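+
+        The returned object also carries ``node_id`` and ``edge_id`` index
+        tensors enumerating the indexed nodes and edges (the latter only
+        when edge features are requested).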
+
+        Args:
+            node_feature_name (str): Feature to use for nodes.
+            edge_feature_name (Optional[str], optional): Feature to use for
+                edges. Defaults to None.
+
+        Returns:
+            Data: Data object containing the specified node and
+                edge features and the graph.
+        """
+        x = torch.Tensor(self.get_node_features(node_feature_name))
+        node_id = torch.LongTensor(range(len(x)))
+
+        edge_index = torch.t(
+            torch.LongTensor(self.get_edge_features(EDGE_INDEX)))
+
+        edge_attr = (self.get_edge_features(edge_feature_name)
+                     if edge_feature_name is not None else None)
+        # Guard against `edge_feature_name=None`, in which case there are
+        # no edge features to enumerate:
+        edge_id = (torch.LongTensor(range(len(edge_attr)))
+                   if edge_attr is not None else None)
+
+        return Data(x=x, edge_index=edge_index, edge_attr=edge_attr,
+                    edge_id=edge_id, node_id=node_id)
+
+    def __eq__(self, value: "LargeGraphIndexer") -> bool:
+        eq = True
+        eq &= self._nodes == value._nodes
+        eq &= self._edges == value._edges
+        eq &= self.node_attr.keys() == value.node_attr.keys()
+        eq &= self.edge_attr.keys() == value.edge_attr.keys()
+        eq &= self._mapped_node_features == value._mapped_node_features
+        eq &= self._mapped_edge_features == value._mapped_edge_features
+
+        for k in self.node_attr:
+            eq &= isinstance(self.node_attr[k], type(value.node_attr[k]))
+            if isinstance(self.node_attr[k], torch.Tensor):
+                eq &= torch.equal(self.node_attr[k], value.node_attr[k])
+            else:
+                eq &= self.node_attr[k] == value.node_attr[k]
+        for k in self.edge_attr:
+            eq &= isinstance(self.edge_attr[k], type(value.edge_attr[k]))
+            if isinstance(self.edge_attr[k], torch.Tensor):
+                eq &= torch.equal(self.edge_attr[k], value.edge_attr[k])
+            else:
+                eq &= self.edge_attr[k] == value.edge_attr[k]
+        return eq
+
+
+def get_features_for_triplets_groups(
+    indexer: LargeGraphIndexer,
+    triplet_groups: Iterable[KnowledgeGraphLike],
+    node_feature_name: str = "x",
+    edge_feature_name: str = "edge_attr",
+    pre_transform: Optional[Callable[[TripletLike], TripletLike]] = None,
+    verbose: bool = False,
+) -> Iterator[Data]:
+    """Given an indexer and a series of triplet groups (like a dataset),
+    retrieve the specified node and edge features for each triplet from the
+    index.
+
+    Args:
+        indexer (LargeGraphIndexer): Indexer containing desired features.
+        triplet_groups (Iterable[KnowledgeGraphLike]): List of lists of
+            triplets to fetch features for.
+        node_feature_name (str, optional): Node feature to fetch.
+            Defaults to "x".
+        edge_feature_name (str, optional): Edge feature to fetch.
+            Defaults to "edge_attr".
+        pre_transform (Optional[Callable[[TripletLike], TripletLike]]):
+            Optional preprocessing to perform on triplets.
+            Defaults to None.
+        verbose (bool, optional): Whether to print progress. Defaults to False.
+
+    Yields:
+        Iterator[Data]: For each triplet group, yields a data object
+            containing the unique graph and features from the index.
+    """
+    if pre_transform is not None:
+
+        def apply_transform(trips):
+            for trip in trips:
+                yield pre_transform(tuple(trip))
+
+        # TODO: Make this safe for large amounts of triplets?
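+        # Re-apply the transform lazily, one group at a time: the generator
+        # expression avoids materializing all transformed groups in memory.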
+        triplet_groups = (list(apply_transform(triplets))
+                          for triplets in triplet_groups)
+
+    node_keys = []
+    edge_keys = []
+    edge_index = []
+
+    for triplets in tqdm(triplet_groups, disable=not verbose):
+        small_graph_indexer = LargeGraphIndexer.from_triplets(
+            triplets, pre_transform=pre_transform)
+
+        node_keys.append(small_graph_indexer.get_node_features())
+        edge_keys.append(small_graph_indexer.get_edge_features(pids=triplets))
+        edge_index.append(
+            small_graph_indexer.get_edge_features(EDGE_INDEX, triplets))
+
+    node_feats = indexer.get_node_features(feature_name=node_feature_name,
+                                           pids=chain.from_iterable(node_keys))
+    edge_feats = indexer.get_edge_features(feature_name=edge_feature_name,
+                                           pids=chain.from_iterable(edge_keys))
+
+    last_node_idx, last_edge_idx = 0, 0
+    for (nkeys, ekeys, eidx) in zip(node_keys, edge_keys, edge_index):
+        nlen, elen = len(nkeys), len(ekeys)
+        x = torch.Tensor(node_feats[last_node_idx:last_node_idx + nlen])
+        last_node_idx += len(nkeys)
+
+        edge_attr = torch.Tensor(edge_feats[last_edge_idx:last_edge_idx +
+                                            elen])
+        last_edge_idx += len(ekeys)
+
+        edge_idx = torch.LongTensor(eidx).T
+
+        data_obj = Data(x=x, edge_attr=edge_attr, edge_index=edge_idx)
+        # Store this group's own keys (not the full list of all groups):
+        data_obj[NODE_PID] = nkeys
+        data_obj[EDGE_PID] = ekeys
+        data_obj["node_idx"] = [indexer._nodes[k] for k in nkeys]
+        data_obj["edge_idx"] = [indexer._edges[e] for e in ekeys]
+
+        yield data_obj
+
+
+def get_features_for_triplets(
+    indexer: LargeGraphIndexer,
+    triplets: KnowledgeGraphLike,
+    node_feature_name: str = "x",
+    edge_feature_name: str = "edge_attr",
+    pre_transform: Optional[Callable[[TripletLike], TripletLike]] = None,
+    verbose: bool = False,
+) -> Data:
+    """For a given set of triplets retrieve a Data object containing the
+    unique graph and features from the index.
+
+    Args:
+        indexer (LargeGraphIndexer): Indexer containing desired features.
+        triplets (KnowledgeGraphLike): Triplets to fetch features for.
+        node_feature_name (str, optional): Feature to use for node features.
+            Defaults to "x".
+        edge_feature_name (str, optional): Feature to use for edge features.
+            Defaults to "edge_attr".
+        pre_transform (Optional[Callable[[TripletLike], TripletLike]]):
+            Optional preprocessing function for triplets. Defaults to None.
+        verbose (bool, optional): Whether to print progress. Defaults to False.
+
+    Returns:
+        Data: Data object containing the unique graph and features from the
+            index for the given triplets.
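+
+    Example:
+        A minimal sketch (the toy triplets and the randomly generated
+        features are placeholders for real data):
+
+        .. code-block:: python
+
+            triplets = [("cat", "is", "animal"), ("dog", "is", "animal")]
+            indexer = LargeGraphIndexer.from_triplets(triplets)
+            nodes = indexer.get_unique_node_features()
+            indexer.add_node_feature("x", torch.rand(len(nodes), 8))
+            rels = indexer.get_unique_edge_features("r")
+            indexer.add_edge_feature("edge_attr", torch.rand(len(rels), 8),
+                                     map_from_feature="r")
+            data = get_features_for_triplets(indexer, triplets)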
+ """ + gen = get_features_for_triplets_groups(indexer, [triplets], + node_feature_name, + edge_feature_name, pre_transform, + verbose) + return next(gen) diff --git a/torch_geometric/loader/__init__.py b/torch_geometric/loader/__init__.py index 266f498a113b..7e83c35befb6 100644 --- a/torch_geometric/loader/__init__.py +++ b/torch_geometric/loader/__init__.py @@ -22,6 +22,7 @@ from .prefetch import PrefetchLoader from .cache import CachedLoader from .mixin import AffinityMixin +from .rag_loader import RAGQueryLoader __all__ = classes = [ 'DataLoader', @@ -50,6 +51,7 @@ 'PrefetchLoader', 'CachedLoader', 'AffinityMixin', + 'RAGQueryLoader', ] RandomNodeSampler = deprecated( diff --git a/torch_geometric/loader/rag_loader.py b/torch_geometric/loader/rag_loader.py new file mode 100644 index 000000000000..33d6cf0e868e --- /dev/null +++ b/torch_geometric/loader/rag_loader.py @@ -0,0 +1,106 @@ +from abc import abstractmethod +from typing import Any, Callable, Dict, Optional, Protocol, Tuple, Union + +from torch_geometric.data import Data, FeatureStore, HeteroData +from torch_geometric.sampler import HeteroSamplerOutput, SamplerOutput +from torch_geometric.typing import InputEdges, InputNodes + + +class RAGFeatureStore(Protocol): + """Feature store for remote GNN RAG backend.""" + @abstractmethod + def retrieve_seed_nodes(self, query: Any, **kwargs) -> InputNodes: + """Makes a comparison between the query and all the nodes to get all + the closest nodes. Return the indices of the nodes that are to be seeds + for the RAG Sampler. + """ + ... + + @abstractmethod + def retrieve_seed_edges(self, query: Any, **kwargs) -> InputEdges: + """Makes a comparison between the query and all the edges to get all + the closest nodes. Returns the edge indices that are to be the seeds + for the RAG Sampler. + """ + ... + + @abstractmethod + def load_subgraph( + self, sample: Union[SamplerOutput, HeteroSamplerOutput] + ) -> Union[Data, HeteroData]: + """Combines sampled subgraph output with features in a Data object.""" + ... + + +class RAGGraphStore(Protocol): + """Graph store for remote GNN RAG backend.""" + @abstractmethod + def sample_subgraph(self, seed_nodes: InputNodes, seed_edges: InputEdges, + **kwargs) -> Union[SamplerOutput, HeteroSamplerOutput]: + """Sample a subgraph using the seeded nodes and edges.""" + ... + + @abstractmethod + def register_feature_store(self, feature_store: FeatureStore): + """Register a feature store to be used with the sampler. Samplers need + info from the feature store in order to work properly on HeteroGraphs. + """ + ... + + +# TODO: Make compatible with Heterographs + + +class RAGQueryLoader: + def __init__(self, data: Tuple[RAGFeatureStore, RAGGraphStore], + local_filter: Optional[Callable[[Data, Any], Data]] = None, + seed_nodes_kwargs: Optional[Dict[str, Any]] = None, + seed_edges_kwargs: Optional[Dict[str, Any]] = None, + sampler_kwargs: Optional[Dict[str, Any]] = None, + loader_kwargs: Optional[Dict[str, Any]] = None): + """Loader meant for making queries from a remote backend. + + Args: + data (Tuple[RAGFeatureStore, RAGGraphStore]): Remote FeatureStore + and GraphStore to load from. Assumed to conform to the + protocols listed above. + local_filter (Optional[Callable[[Data, Any], Data]], optional): + Optional local transform to apply to data after retrieval. + Defaults to None. + seed_nodes_kwargs (Optional[Dict[str, Any]], optional): Paramaters + to pass into process for fetching seed nodes. Defaults to None. 
+            seed_edges_kwargs (Optional[Dict[str, Any]], optional): Parameters
+                to pass into process for fetching seed edges. Defaults to None.
+            sampler_kwargs (Optional[Dict[str, Any]], optional): Parameters to
+                pass into process for sampling graph. Defaults to None.
+            loader_kwargs (Optional[Dict[str, Any]], optional): Parameters to
+                pass into process for loading graph features. Defaults to None.
+        """
+        fstore, gstore = data
+        self.feature_store = fstore
+        self.graph_store = gstore
+        self.graph_store.register_feature_store(self.feature_store)
+        self.local_filter = local_filter
+        self.seed_nodes_kwargs = seed_nodes_kwargs or {}
+        self.seed_edges_kwargs = seed_edges_kwargs or {}
+        self.sampler_kwargs = sampler_kwargs or {}
+        self.loader_kwargs = loader_kwargs or {}
+
+    def query(self, query: Any) -> Data:
+        """Retrieve the subgraph associated with the query, together with
+        all of its feature attributes.
+        """
+        seed_nodes = self.feature_store.retrieve_seed_nodes(
+            query, **self.seed_nodes_kwargs)
+        seed_edges = self.feature_store.retrieve_seed_edges(
+            query, **self.seed_edges_kwargs)
+
+        subgraph_sample = self.graph_store.sample_subgraph(
+            seed_nodes, seed_edges, **self.sampler_kwargs)
+
+        data = self.feature_store.load_subgraph(sample=subgraph_sample,
+                                                **self.loader_kwargs)
+
+        if self.local_filter:
+            data = self.local_filter(data, query)
+        return data
diff --git a/torch_geometric/nn/models/g_retriever.py b/torch_geometric/nn/models/g_retriever.py
index 6f8fbcc644dc..f7529ae721b7 100644
--- a/torch_geometric/nn/models/g_retriever.py
+++ b/torch_geometric/nn/models/g_retriever.py
@@ -21,6 +21,8 @@ class GRetriever(torch.nn.Module):
             (default: :obj:`False`)
         mlp_out_channels (int, optional): The size of each graph embedding
             after projection. (default: :obj:`4096`)
+        mlp_out_tokens (int, optional): Number of LLM prefix tokens to
+            reserve for GNN output. (default: :obj:`1`)

    .. warning::
        This module has been tested with the following HuggingFace models
@@ -43,6 +45,7 @@ def __init__(
         gnn: torch.nn.Module,
         use_lora: bool = False,
         mlp_out_channels: int = 4096,
+        mlp_out_tokens: int = 1,
     ) -> None:
         super().__init__()
@@ -77,7 +80,9 @@ def __init__(
         self.projector = torch.nn.Sequential(
             torch.nn.Linear(mlp_hidden_channels, mlp_hidden_channels),
             torch.nn.Sigmoid(),
-            torch.nn.Linear(mlp_hidden_channels, mlp_out_channels),
+            torch.nn.Linear(mlp_hidden_channels,
+                            mlp_out_channels * mlp_out_tokens),
+            torch.nn.Unflatten(-1, (mlp_out_tokens, mlp_out_channels)),
         ).to(self.llm.device)

     def encode(
@@ -126,6 +131,9 @@ def forward(
         x = self.projector(x)
         xs = x.split(1, dim=0)

+        # Handle case where there's more than one embedding for each sample:
+        xs = [x.squeeze(0) for x in xs]
+
         # Handle questions without node features:
         batch_unique = batch.unique()
         batch_size = len(question)
@@ -182,6 +190,9 @@ def inference(
         x = self.projector(x)
         xs = x.split(1, dim=0)

+        # Handle case where there's more than one embedding for each sample:
+        xs = [x.squeeze(0) for x in xs]
+
         # Handle questions without node features:
         batch_unique = batch.unique()
         batch_size = len(question)
diff --git a/torch_geometric/profile/__init__.py b/torch_geometric/profile/__init__.py
index 833ee657d0e7..22d3039f4c83 100644
--- a/torch_geometric/profile/__init__.py
+++ b/torch_geometric/profile/__init__.py
@@ -20,6 +20,7 @@
     get_gpu_memory_from_nvidia_smi,
     get_model_size,
 )
+from .nvtx import nvtxit

 __all__ = [
     'profileit',
@@ -38,6 +39,7 @@
     'get_gpu_memory_from_nvidia_smi',
     'get_gpu_memory_from_ipex',
     'benchmark',
+    'nvtxit',
 ]

 classes = __all__
diff --git a/torch_geometric/profile/nvtx.py b/torch_geometric/profile/nvtx.py
new file mode 100644
index 000000000000..8dbce375ae5a
--- /dev/null
+++ b/torch_geometric/profile/nvtx.py
@@ -0,0 +1,66 @@
+from functools import wraps
+from typing import Optional
+
+import torch
+
+CUDA_PROFILE_STARTED = False
+
+
+def begin_cuda_profile():
+    global CUDA_PROFILE_STARTED
+    prev_state = CUDA_PROFILE_STARTED
+    if prev_state is False:
+        CUDA_PROFILE_STARTED = True
+        torch.cuda.cudart().cudaProfilerStart()
+    return prev_state
+
+
+def end_cuda_profile(prev_state: bool):
+    global CUDA_PROFILE_STARTED
+    CUDA_PROFILE_STARTED = prev_state
+    if prev_state is False:
+        torch.cuda.cudart().cudaProfilerStop()
+
+
+def nvtxit(name: Optional[str] = None, n_warmups: int = 0,
+           n_iters: Optional[int] = None):
+    """Enables NVTX profiling for a function.
+
+    Args:
+        name (Optional[str], optional): Name to give the NVTX reference frame
+            for the wrapped function. Defaults to the function's name in
+            code.
+        n_warmups (int, optional): Number of warm-up calls to run before
+            profiling starts. Defaults to 0.
+        n_iters (Optional[int], optional): Number of calls to record after
+            warm-up. Defaults to all of them.
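+
+    Example:
+        A minimal sketch (the function name and iteration counts are
+        illustrative; the emitted NVTX ranges only become visible when
+        running under an external profiler such as Nsight Systems):
+
+        .. code-block:: python
+
+            @nvtxit(name="gnn_forward", n_warmups=2, n_iters=5)
+            def gnn_forward(batch):
+                return model(batch.x, batch.edge_index)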
+ """ + def nvtx(func): + + nonlocal name + iters_so_far = 0 + if name is None: + name = func.__name__ + + @wraps(func) + def wrapper(*args, **kwargs): + nonlocal iters_so_far + if not torch.cuda.is_available(): + return func(*args, **kwargs) + elif iters_so_far < n_warmups: + iters_so_far += 1 + return func(*args, **kwargs) + elif n_iters is None or iters_so_far < n_iters + n_warmups: + prev_state = begin_cuda_profile() + torch.cuda.nvtx.range_push(f"{name}_{iters_so_far}") + result = func(*args, **kwargs) + torch.cuda.nvtx.range_pop() + end_cuda_profile(prev_state) + iters_so_far += 1 + return result + else: + return func(*args, **kwargs) + + return wrapper + + return nvtx From 10047ba010cf17e72ec54ef8c1fb5067c8c54334 Mon Sep 17 00:00:00 2001 From: Matthias Fey Date: Tue, 26 Nov 2024 15:32:55 +0100 Subject: [PATCH 09/10] Check that custom edge types actually exist in `NumNeighbors` definition (#9807) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- CHANGELOG.md | 1 + test/sampler/test_sampler_base.py | 3 +++ torch_geometric/sampler/base.py | 8 ++++++++ 3 files changed, 12 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 341be665fabf..3867de8b1bbb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,6 +26,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Changed - Dropped Python 3.8 support ([#9696](https://github.com/pyg-team/pytorch_geometric/pull/9606)) +- Added a check that confirms that custom edge types of `NumNeighbors` actually exist in the graph ([#9807](https://github.com/pyg-team/pytorch_geometric/pull/9807)) ### Deprecated diff --git a/test/sampler/test_sampler_base.py b/test/sampler/test_sampler_base.py index dc8142176bf6..41a7da25534f 100644 --- a/test/sampler/test_sampler_base.py +++ b/test/sampler/test_sampler_base.py @@ -49,6 +49,9 @@ def test_heterogeneous_num_neighbors_dict_and_default(): num_neighbors = NumNeighbors({('A', 'B'): [25, 10]}, default=[-1, -1]) + with pytest.raises(ValueError, match="Not all edge types"): + num_neighbors.get_values([('A', 'C'), ('B', 'A')]) + values = num_neighbors.get_values([('A', 'B'), ('B', 'A')]) assert values == {('A', 'B'): [25, 10], ('B', 'A'): [-1, -1]} diff --git a/torch_geometric/sampler/base.py b/torch_geometric/sampler/base.py index d67ddd5af79b..1bd2e4346e1d 100644 --- a/torch_geometric/sampler/base.py +++ b/torch_geometric/sampler/base.py @@ -425,6 +425,14 @@ def _get_values( else: assert False + # Confirm that `values` only hold valid edge types: + if isinstance(self.values, dict): + edge_types_str = {EdgeTypeStr(key) for key in edge_types} + invalid_edge_types = set(self.values.keys()) - edge_types_str + if len(invalid_edge_types) > 0: + raise ValueError("Not all edge types specified in " + "'num_neighbors' exist in the graph") + out = {} for edge_type in edge_types: edge_type_str = EdgeTypeStr(edge_type) From 46705844b39ededc0fcef1de90e73923480a6446 Mon Sep 17 00:00:00 2001 From: abertics Date: Thu, 28 Nov 2024 08:19:36 -0800 Subject: [PATCH 10/10] Fix typo in Dataset docstring (#9813) baching -> batching --- torch_geometric/data/dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch_geometric/data/dataset.py b/torch_geometric/data/dataset.py index dd5239eec7c2..9df4359f1450 100644 --- a/torch_geometric/data/dataset.py +++ b/torch_geometric/data/dataset.py @@ -383,7 +383,7 @@ def to_datapipe(self) -> Any: r"""Converts the dataset into a :class:`torch.utils.data.DataPipe`. 
The returned instance can then be used with :pyg:`PyG's` built-in - :class:`DataPipes` for baching graphs as follows: + :class:`DataPipes` for batching graphs as follows: .. code-block:: python