diff --git a/.gitattributes b/.gitattributes
index 251a617..5f080b8 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -1 +1,6 @@
frontend/** linguist-vendored
+frontend/** linguist-vendored
+[submodule "pytorch"]
+ path = pytorch
+ url = https://github.com/pytorch/pytorch
+
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci-approval.yml
similarity index 92%
rename from .github/workflows/ci.yml
rename to .github/workflows/ci-approval.yml
index 992ef59..5068e0d 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci-approval.yml
@@ -1,4 +1,4 @@
-name: CI
+name: ci-approval
on:
workflow_dispatch:
@@ -84,7 +84,7 @@ jobs:
run: ./quadtrix --help || true
- name: Upload binary
- uses: actions/upload-artifact@v4
+ uses: actions/upload-artifact@v7
with:
name: quadtrix-linux-amd64
path: quadtrix
@@ -111,7 +111,7 @@ jobs:
run: tar -czf quadtrix-macos-arm64.tar.gz quadtrix
- name: Upload binary
- uses: actions/upload-artifact@v4
+ uses: actions/upload-artifact@v7
with:
name: quadtrix-macos-arm64
path: quadtrix-macos-arm64.tar.gz
@@ -127,15 +127,15 @@ jobs:
steps:
- uses: actions/checkout@v4
- - uses: docker/setup-qemu-action@v3
- - uses: docker/setup-buildx-action@v3
+ - uses: docker/setup-qemu-action@v4
+ - uses: docker/setup-buildx-action@v4
- name: Set lowercase image prefix
run: echo "IMAGE_PREFIX=ghcr.io/${GITHUB_REPOSITORY_OWNER,,}/quadtrix" >> $GITHUB_ENV
- name: Login to GHCR
if: ${{ inputs.push_image == 'true' }}
- uses: docker/login-action@v3
+ uses: docker/login-action@v4
with:
registry: ${{ env.REGISTRY }}
username: ${{ github.actor }}
@@ -143,7 +143,7 @@ jobs:
- name: Extract metadata
id: meta
- uses: docker/metadata-action@v5
+ uses: docker/metadata-action@v6
with:
images: ${{ env.IMAGE_PREFIX }}-cpp
tags: |
@@ -174,15 +174,15 @@ jobs:
steps:
- uses: actions/checkout@v4
- - uses: docker/setup-qemu-action@v3
- - uses: docker/setup-buildx-action@v3
+ - uses: docker/setup-qemu-action@v4
+ - uses: docker/setup-buildx-action@v4
- name: Set lowercase image prefix
run: echo "IMAGE_PREFIX=ghcr.io/${GITHUB_REPOSITORY_OWNER,,}/quadtrix" >> $GITHUB_ENV
- name: Login to GHCR
if: ${{ inputs.push_image == 'true' }}
- uses: docker/login-action@v3
+ uses: docker/login-action@v4
with:
registry: ${{ env.REGISTRY }}
username: ${{ github.actor }}
@@ -190,7 +190,7 @@ jobs:
- name: Extract metadata
id: meta
- uses: docker/metadata-action@v5
+ uses: docker/metadata-action@v6
with:
images: ${{ env.IMAGE_PREFIX }}-cpu
tags: |
@@ -221,14 +221,14 @@ jobs:
steps:
- uses: actions/checkout@v4
- - uses: docker/setup-buildx-action@v3
+ - uses: docker/setup-buildx-action@v4
- name: Set lowercase image prefix
run: echo "IMAGE_PREFIX=ghcr.io/${GITHUB_REPOSITORY_OWNER,,}/quadtrix" >> $GITHUB_ENV
- name: Login to GHCR
if: ${{ inputs.push_image == 'true' }}
- uses: docker/login-action@v3
+ uses: docker/login-action@v4
with:
registry: ${{ env.REGISTRY }}
username: ${{ github.actor }}
@@ -236,7 +236,7 @@ jobs:
- name: Extract metadata
id: meta
- uses: docker/metadata-action@v5
+ uses: docker/metadata-action@v6
with:
images: ${{ env.IMAGE_PREFIX }}-cuda
tags: |
diff --git a/.github/workflows/docker-publish.yml b/.github/workflows/docker-publish.yml
index 0986534..205c599 100644
--- a/.github/workflows/docker-publish.yml
+++ b/.github/workflows/docker-publish.yml
@@ -41,15 +41,15 @@ jobs:
steps:
- uses: actions/checkout@v4
- - uses: docker/setup-qemu-action@v3
- - uses: docker/setup-buildx-action@v3
+ - uses: docker/setup-qemu-action@v4
+ - uses: docker/setup-buildx-action@v4
- name: Set lowercase image prefix
run: echo "IMAGE_PREFIX=ghcr.io/${GITHUB_REPOSITORY_OWNER,,}/quadtrix" >> "$GITHUB_ENV"
- name: Login to GHCR
if: ${{ inputs.push_image == 'true' }}
- uses: docker/login-action@v3
+ uses: docker/login-action@v4
with:
registry: ${{ env.REGISTRY }}
username: ${{ github.actor }}
@@ -57,7 +57,7 @@ jobs:
- name: Extract metadata
id: meta
- uses: docker/metadata-action@v5
+ uses: docker/metadata-action@v6
with:
images: ${{ env.IMAGE_PREFIX }}-cpp
tags: |
@@ -88,15 +88,15 @@ jobs:
steps:
- uses: actions/checkout@v4
- - uses: docker/setup-qemu-action@v3
- - uses: docker/setup-buildx-action@v3
+ - uses: docker/setup-qemu-action@v4
+ - uses: docker/setup-buildx-action@v4
- name: Set lowercase image prefix
run: echo "IMAGE_PREFIX=ghcr.io/${GITHUB_REPOSITORY_OWNER,,}/quadtrix" >> "$GITHUB_ENV"
- name: Login to GHCR
if: ${{ inputs.push_image == 'true' }}
- uses: docker/login-action@v3
+ uses: docker/login-action@v4
with:
registry: ${{ env.REGISTRY }}
username: ${{ github.actor }}
@@ -104,7 +104,7 @@ jobs:
- name: Extract metadata
id: meta
- uses: docker/metadata-action@v5
+ uses: docker/metadata-action@v6
with:
images: ${{ env.IMAGE_PREFIX }}-cpu
tags: |
@@ -135,14 +135,14 @@ jobs:
steps:
- uses: actions/checkout@v4
- - uses: docker/setup-buildx-action@v3
+ - uses: docker/setup-buildx-action@v4
- name: Set lowercase image prefix
run: echo "IMAGE_PREFIX=ghcr.io/${GITHUB_REPOSITORY_OWNER,,}/quadtrix" >> "$GITHUB_ENV"
- name: Login to GHCR
if: ${{ inputs.push_image == 'true' }}
- uses: docker/login-action@v3
+ uses: docker/login-action@v4
with:
registry: ${{ env.REGISTRY }}
username: ${{ github.actor }}
@@ -150,7 +150,7 @@ jobs:
- name: Extract metadata
id: meta
- uses: docker/metadata-action@v5
+ uses: docker/metadata-action@v6
with:
images: ${{ env.IMAGE_PREFIX }}-cuda
tags: |
diff --git a/.github/workflows/pr-check.yml b/.github/workflows/pr-check.yml
index b50de6b..07b05f2 100644
--- a/.github/workflows/pr-check.yml
+++ b/.github/workflows/pr-check.yml
@@ -150,7 +150,7 @@ jobs:
run: ./quadtrix --help || true
- name: Upload artifact
- uses: actions/upload-artifact@v4
+ uses: actions/upload-artifact@v7
with:
name: ${{ matrix.artifact }}
path: quadtrix
@@ -199,19 +199,19 @@ jobs:
exit $failed
- name: Lint — Dockerfile.cpp
- uses: hadolint/hadolint-action@v3.1.0
+ uses: hadolint/hadolint-action@v3.3.0
with:
dockerfile: .devops/Dockerfile.cpp
failure-threshold: error
- name: Lint — Dockerfile (CPU)
- uses: hadolint/hadolint-action@v3.1.0
+ uses: hadolint/hadolint-action@v3.3.0
with:
dockerfile: .devops/Dockerfile
failure-threshold: error
- name: Lint — Dockerfile.backend (CUDA)
- uses: hadolint/hadolint-action@v3.1.0
+ uses: hadolint/hadolint-action@v3.3.0
with:
dockerfile: .devops/Dockerfile.backend
failure-threshold: error
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index e92b486..05fd17b 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -93,7 +93,7 @@ jobs:
tar -czf "${package}.tar.gz" -C "${ARTIFACT_ROOT}" "${package}"
- name: Upload artifacts
- uses: actions/upload-artifact@v4
+ uses: actions/upload-artifact@v7
with:
name: quadtrix-bin-ubuntu-${{ matrix.build }}-cpu
path: quadtrix-${{ needs.release-metadata.outputs.tag_name }}-bin-ubuntu-${{ matrix.build }}-cpu.tar.gz
@@ -142,7 +142,7 @@ jobs:
Compress-Archive -Path "${env:ARTIFACT_ROOT}\${package}\*" -DestinationPath "${package}.zip" -Force
- name: Upload artifacts
- uses: actions/upload-artifact@v4
+ uses: actions/upload-artifact@v7
with:
name: quadtrix-bin-windows-${{ matrix.arch }}-cpu
path: quadtrix-${{ needs.release-metadata.outputs.tag_name }}-bin-windows-${{ matrix.arch }}-cpu.zip
@@ -192,7 +192,7 @@ jobs:
tar -czf "${package}.tar.gz" -C "${ARTIFACT_ROOT}" "${package}"
- name: Upload artifacts
- uses: actions/upload-artifact@v4
+ uses: actions/upload-artifact@v7
with:
name: quadtrix-bin-macos-${{ matrix.build }}-cpu
path: quadtrix-${{ needs.release-metadata.outputs.tag_name }}-bin-macos-${{ matrix.build }}-cpu.tar.gz
diff --git a/README.md b/README.md
index 7eb276d..bc88dfd 100644
--- a/README.md
+++ b/README.md
@@ -1,140 +1,140 @@
# Quadtrix.cpp
-
-
-
+Language models in dependency-free C++, with no need for 245MB of PyTorch or 107MB of CPython to make a transformer actually work. The native path is a from-scratch decoder-only GPT: tensors, embeddings, multi-head causal self-attention, layer norm, cross-entropy, and an analytical backward pass with AdamW, all in [main.cpp](main.cpp) and [include/](include/). No autograd, no framework — every gradient is derived and written out.
+***technical notes***: [docs](https://eamon2009.github.io/LLMs/)
+Alongside it sits a parallel PyTorch implementation in [engine/main.py](engine/main.py) and [engine/inference.py](engine/inference.py), so you can train and generate the same architecture with `torch` + `tiktoken` when you want speed instead of transparency. A FastAPI middleware layer in [backend/](backend/) and a React/TypeScript web UI in [frontend/](frontend/) let you chat with either backend in the browser. There's also an experimental integrated-GPU path in [iGPU/](engine/iGPU/).
+The point of this repo is the C++ core. The PyTorch, FastAPI, and frontend layers exist to make the model usable, but if you're here to learn how a GPT is actually built and trained without a framework doing the work for you, [include/backward.h](include/backward.h) is where to start reading.
-
-Quadtrix.cpp is an experimental local LLM project that combines training, inference, and a chat interface in a single repository across three execution paths: a C++ versions, a PyTorch backend, and a React frontend.The C++ version is a decoder-only transformer - no external tensor libraries, no automatic differentiation. It includes character-level tokenization, backpropagation, AdamW optimization, checkpointing, and autoregressive generationd. The PyTorch version offers faster training and inference with BPE tokenization via tiktoken, while the React frontend provides a web UI on top of python backend.You can train small character-level models on CPU or Colab, or move to BPE-based models with the PyTorch backend. All version are designed to be hackable and runnable on consumer hardware without requiring massive GPU clusters.
+## quick start (C++, train + chat)
-***technical notes***: [docs](https://eamon2009.github.io/LLMs/)
+The fastest way to see the whole pipeline — tokenize, train, checkpoint, generate — using the bundled character-level corpus:
-## Leaderboard
+```bash
+g++ -std=c++17 -O2 -I. -Iinclude -o quadtrix.exe main.cpp
+./quadtrix.exe data/input.txt
+```
-Runs are ranked by validation loss. Lower is better.
+This trains from scratch on `data/input.txt` and writes the best checkpoint to `best_model.bin`. Once you have a checkpoint, generate or chat with it:
-| # | Val Loss | Parameters | Time | Hardware | Description |
-|--:|----------|------------|----------|----------|------------------------------------------|
-| 1 | **0.7176** | 10.82M | 61.3 min | Colab | Large-scale run, coherent paragraphs, strong convergence |
-| 2 | 0.9250 | 1.99M | 6.1 min | T4 GPU | Optimised run, fast training, stable learning |
-| 3 | 1.3145 | 0.82M | 39.4 min | CPU | Baseline, small data |
-| 4 | 1.6371 | 0.82M | 76.2 min | CPU | Extended CPU training, 3000 iterations |
+```bash
+./quadtrix.exe data/input.txt --generate
+./quadtrix.exe data/input.txt --chat --chat-tokens 300
+```
-All runs: @Eamon2009, 2026.
+debugging tip: drop `-O2` for `-g` when compiling if you want to step through `include/backward.h` or `include/gpt.h` in a debugger — the manual backward pass is much easier to follow one breakpoint at a time.
-## Benchmarks
+### runtime arguments
-### Runs at a Glance
+```bash
+quadtrix.exe [data_path] [--generate] [--chat] [--chat-tokens N]
+```
-| Metric | Character-Level | Small Scale | Large Scale |
-|-------------------|------------------|-------------|-------------|
-| Parameters | 0.83M | 2.00M | 19.17M |
-| Layers | 4 | 4 | 4 |
-| Embedding dim | 128 | 200 | 200 |
-| Attention heads | 4 | 4 | 4 |
-| Context length | 64 | 200 | 200 |
-| Vocab | 105 char | 110 char | ~50K BPE |
-| Corpus | TinyStories | TinyStories | Children's Stories |
-| Iterations | 3,000 | 5,000 | 5,000 |
-| Train loss | 1.5632 | 0.9045 | — |
-| Val loss | 1.6371 | 0.9301 | — |
-| Gen. gap | 0.0739 | 0.0256 | — |
+| Argument | Description |
+|---|---|
+| `data_path` | Plain-text corpus used to build the tokenizer and train/validation split |
+| `--generate` | Load weights and continuously generate text |
+| `--chat` | Load weights and start interactive terminal chat |
+| `--chat-tokens N` | Max generated tokens per chat response |
+
+| Env var | Default | Description |
+|---|---|---|
+| `GPT_DATA_PATH` | `data/input.txt` | Override the default training corpus |
+| `GPT_MODEL_PATH` | `best_model.bin` | Override the checkpoint path |
+
+## what's actually implemented in C++
+
+No third-party runtime dependency — it builds from `main.cpp`, `config/config.h`, and `include/*.h` alone.
+
+- Character-level tokenizer built directly from the input corpus
+- Train/validation split via `DataLoader`
+- Token + positional embeddings
+- Multi-head causal self-attention with explicit QKV projections
+- Pre-layer-norm residual transformer blocks
+- Feed-forward MLP with ReLU
+- Cross-entropy loss
+- **Fully analytical backward pass** — every gradient (attention, layer norm, MLP, embeddings) is derived and coded in `include/backward.h`, not autograd
+- AdamW optimizer (first/second moment estimates, weight decay)
+- Checkpoint save/load
+- Autoregressive generation and terminal chat mode
+
+Hyperparameters live in `config/config.h` and require a rebuild to take effect:
+
+```cpp
+static const int BATCH_SIZE = 4;
+static const int BLOCK_SIZE = 64;
+static const int N_EMBD = 128;
+static const int N_HEAD = 4;
+static const int N_LAYER = 4;
+static const float DROPOUT = 0.2f;
+static const float LEARNING_RATE = 3e-4f;
+static const int MAX_ITERS = 3000;
+```
-### Comparison to Related Projects
+For an optimized native build:
-| Project | Focus | Language | Autograd |
-|---------------|--------------------------------------------|-------------------|----------------------|
-| nanoGPT | Minimal GPT training | Python | PyTorch |
-| minGPT | Educational GPT | Python | PyTorch |
-| llama2.c | Inference-oriented C | C | None |
-| **Quadtrix.cpp** | Training + inference + web UI + multi-backend | C++ / Python / TypeScript | Manual C++ + PyTorch |
+```bash
+g++ -std=c++17 -O3 -march=native -I. -Iinclude -o quadtrix.exe main.cpp
+```
+## the PyTorch reference path
-# Getting started
-**Create a Python virtual environment**
+[engine/main.py](engine/main.py) trains the same architectural idea with `torch`, `torch.nn`, and GPT-2 BPE tokenization via `tiktoken`, useful when you want to scale past what C++ loops can comfortably train on CPU.
-```powershell
-python -m venv .venv
-.\.venv\Scripts\python.exe -m pip install --upgrade pip
+```bash
+python engine/main.py
```
-** Install backend and PyTorch dependencies**
+It looks for `engine/input.txt` by default; point it elsewhere with `QUADTRIX_TRAIN_DATA` if needed. Run inference against a saved checkpoint:
-```powershell
-cd backend
-..\.venv\Scripts\python.exe -m pip install -r requirements.txt
-cd ..
-```
-**Train**
-```powershell
-cd C:\Users\Admin\Documents\GitHub\Quadtrix.cpp
-.\.venv\Scripts\python.exe engine\main.py
+```bash
+python engine/inference.py --checkpoint engine/best_model.pt --prompt "Once upon a time" --max-new-tokens 100
```
-**Build the C++ executable**
-Skip if `quadtrix.exe` already exists. To rebuild:
+## web chat (FastAPI + React)
-```powershell
-g++ -std=c++17 -O2 -I. -Iinclude -o quadtrix.exe main.cpp
-```
+To chat with either backend from a browser instead of the terminal, bring up the API and the frontend in two terminals:
-For maximum CPU throughput:
+```bash
+# terminal 1 — backend
+cd backend && uvicorn main:app --host 127.0.0.1 --port 3001
-```powershell
-g++ -std=c++17 -O3 -march=native -I. -Iinclude -o quadtrix.exe main.cpp
+# terminal 2 — frontend
+cd frontend && npm run dev
```
----
+Then open `http://localhost:5173` and select a backend. The PyTorch path works out of the box once a `.pt` checkpoint exists; the C++ backend option expects a compatible HTTP service at `CPP_SERVER_URL` exposing `/health` and `/generate`, which `main.cpp` does not currently serve on its own — use the PyTorch backend for the web UI unless you've built that bridge.
+## results so far
-## Technical Reference
-***Quadtrix is a decoder-only transformer. The architecture follows the standard GPT design with pre-layer normalization, causal self-attention, and residual connections.***
-The C++ version is :
+
-- Character-level tokenizer
-- Manual tensor operations
-- Analytical backpropagation
-- AdamW optimizer with bias correction
-- Checkpoint save/load
-- Autoregressive generation
+| Run | Params | Val loss | Time | Notes |
+|---|---|---|---|---|
+| C++ CPU baseline | 0.82M | 1.31 | 39.4 min | small data, fragmented output |
+| C++ CPU extended | 0.83M | 1.64 | 76.2 min | 3,000 iters, char-level, 28.3M train tokens |
+| T4 | 10.82M | 0.72 | 61.3 min | coherent paragraphs, strong convergence |
+| T4 optimized | 1.99M | 0.93 | 6.1 min | fast, stable, basic coherence |
-**The PyTorch path uses torch.nn and tiktoken for faster experimentation and GPU acceleration.**
+See [run.md](run.md) and the leaderboard in the full docs for more configurations.
-## File Structure
-```
-Quadtrix.cpp/
-├── main.cpp
-├── config/
-│ └── config.h
-├── include/
-│ ├── tensor.h
-│ ├── gpt.h
-│ └── backward.h
-│ └── ...
-│
-├── data/
-│ └── input.txt
-├── engine/
-│ ├── main.py
-│ ├── inference.py
-│ └── best_model.pt
-├── backend/
-│ ├── main.py
-│ ├── inference.py
-│ └── requirements.txt
-├── frontend/
-│ ├── src/
-│ └── ...
-└── quadtrix.exe (after build)
-```
+## how this differs from similar projects
+
+| Project | Focus | Language | Autograd |
+|---|---|---|---|
+| nanoGPT / minGPT | Minimal, educational GPT training | Python | PyTorch |
+| llama2.c | Inference-only | C | None |
+| **Quadtrix.cpp** | Training *and* inference, manual backward pass, web UI | C++ / Python / TypeScript | Manual (C++) + PyTorch |
+
+I'd like the C++ core (`main.cpp`, `include/`, `config/`) to stay dependency-free and to stay the part of this repo that explains transformer internals directly. The PyTorch engine, FastAPI middleware, and React frontend are welcome to grow more features, integrations, and UI polish. If you build a port to another language or framework, I'm happy to link to it from a notable-forks section; just open an issue or PR.
+
+## references
+
+- Vaswani et al., "Attention Is All You Need", 2017
+- Radford et al., GPT-2 technical work, 2019
+- nanoGPT and minGPT as educational reference points
-## References
-- Vaswani et al., "Attention Is All You Need", NeurIPS 2017.
-- Radford et al., "Language Models are Unsupervised Multitask Learners", 2019.
-- Karpathy, nanoGPT
-- Karpathy, minGPT
+## license
-## License
MIT
diff --git a/backend/requirements.txt b/backend/requirements.txt
index cec57cc..960d38f 100644
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -1,8 +1,8 @@
fastapi==0.115.6
uvicorn[standard]==0.34.0
-pydantic==2.10.4
+pydantic==2.13.4
pydantic-settings==2.7.1
httpx==0.28.1
-redis==5.2.1
+redis==8.0.1
torch
tiktoken
diff --git a/docs/quadtrix_training_report.png b/docs/quadtrix_training_report.png
new file mode 100644
index 0000000..1add67d
Binary files /dev/null and b/docs/quadtrix_training_report.png differ
diff --git a/engine/main.py b/engine/main.py
index 78b5eb3..5c15109 100644
--- a/engine/main.py
+++ b/engine/main.py
@@ -63,17 +63,19 @@ def success(msg): log(f" ok {msg}")
train_split = 0.9
seed = 1337
-batch_size = 16
-block_size = 32
-max_iters = 20000
-eval_interval = 100
-learning_rate = 1e-3
+
device = 'cuda' if torch.cuda.is_available() else 'cpu'
-eval_iters = 20
-n_embd = 64
-n_head = 4
-n_layer = 4
dropout = 0.1
+block_size = 256
+n_embd = 192
+n_head = 6
+n_layer = 6
+batch_size = 64
+max_iters = 5000
+eval_interval = 250
+learning_rate = 6e-4
+eval_iters = 200
+dropout = 0.1  # NOTE(review): dropout is already set to 0.1 on the unchanged line just above these additions; this repeat looks redundant
torch.manual_seed(seed)
diff --git a/train_test/model.py b/train_test/model.py
new file mode 100644
index 0000000..33425e9
--- /dev/null
+++ b/train_test/model.py
@@ -0,0 +1,131 @@
+import torch
+import torch.nn as nn
+from torch.nn import functional as F
+from typing import Optional, Tuple
+import os
+import tiktoken
+
+# File Paths
+_project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))  # two directory levels up from this file
+_model_path = os.path.join(_project_root, 'best_model.pt')
+_script_path = os.path.join(_project_root, 'best_model_script.pt')
+
+# Hyperparameters. NOTE(review): previously described as synced with main.py, but engine/main.py in this same change sets block_size=256, n_embd=192, n_head=6, n_layer=6 - confirm which training config these values mirror.
+block_size = 12
+n_embd = 64
+n_head = 4
+n_layer = 4
+dropout = 0.1
+device = 'cuda' if torch.cuda.is_available() else 'cpu'  # use CUDA only when torch reports it as available
+
+# Tokenizer (tiktoken GPT-2 encoding, as used by engine/main.py)
+tokenizer = tiktoken.get_encoding("gpt2")
+vocab_size = tokenizer.n_vocab  # sizes the token embedding and the output projection of GPTLanguageModel
+def encode(s): return tokenizer.encode(s)  # text -> token ids
+def decode(l): return tokenizer.decode(l)  # token ids -> text
+
+
+chars = []  # NOTE(review): not referenced in the visible part of this file; possibly a leftover from a character-level tokenizer
+
+
+# Model classes
+class Head(nn.Module):  # one head of causal (masked) self-attention
+    def __init__(self, head_size: int):
+        super().__init__()
+        self.key = nn.Linear(n_embd, head_size, bias=False)  # key/query/value are bias-free projections from n_embd to head_size
+        self.query = nn.Linear(n_embd, head_size, bias=False)
+        self.value = nn.Linear(n_embd, head_size, bias=False)
+        self.register_buffer('tril', torch.tril(  # lower-triangular ones mask, registered as a buffer rather than a parameter
+            torch.ones(block_size, block_size)))
+        self.dropout = nn.Dropout(dropout)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:  # x is unpacked as (B, T, C); the result has head_size features per position
+        B, T, C = x.shape
+        k = self.key(x)
+        q = self.query(x)
+        wei = q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5  # (B, T, T) attention scores scaled by head_size ** -0.5
+        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))  # block attention to later positions; the mask only covers T <= block_size
+        wei = F.softmax(wei, dim=-1)  # -inf entries become zero weight
+        wei = self.dropout(wei)
+        return wei @ self.value(x)  # weighted sum of value vectors
+
+
+class MultiHeadAttention(nn.Module):  # runs num_heads Head modules on the same input and mixes their outputs
+    def __init__(self, num_heads: int, head_size: int):
+        super().__init__()
+        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])  # ModuleList so every head's parameters are registered
+        self.proj = nn.Linear(head_size * num_heads, n_embd)  # maps the concatenated head outputs back to n_embd
+        self.dropout = nn.Dropout(dropout)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        out = torch.cat([h(x) for h in self.heads], dim=-1)  # concatenate per-head outputs along the last axis
+        return self.dropout(self.proj(out))
+
+
+class FeedFoward(nn.Module):  # MLP: Linear -> ReLU -> Linear -> Dropout (class name spelling kept; Block refers to it as written)
+    def __init__(self, n_embd: int):  # this parameter shadows the module-level n_embd inside the constructor
+        super().__init__()
+        self.net = nn.Sequential(
+            nn.Linear(n_embd, 4 * n_embd),  # widen to 4 * n_embd
+            nn.ReLU(),
+            nn.Linear(4 * n_embd, n_embd),  # project back down to n_embd
+            nn.Dropout(dropout),  # uses the module-level dropout rate
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.net(x)
+
+
+class Block(nn.Module):  # transformer block: LayerNorm is applied before attention and before the MLP, each with a residual add
+    def __init__(self, n_embd: int, n_head: int):
+        super().__init__()
+        head_size = n_embd // n_head  # integer division; n_head * head_size equals n_embd only when n_head divides n_embd
+        self.sa = MultiHeadAttention(n_head, head_size)
+        self.ffwd = FeedFoward(n_embd)
+        self.ln1 = nn.LayerNorm(n_embd)  # normalizes the input to self-attention
+        self.ln2 = nn.LayerNorm(n_embd)  # normalizes the input to the feed-forward net
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = x + self.sa(self.ln1(x))  # residual connection around attention
+        x = x + self.ffwd(self.ln2(x))  # residual connection around the MLP
+        return x
+
+
+class GPTLanguageModel(nn.Module):  # token + position embeddings -> n_layer Blocks -> LayerNorm -> linear head over the vocabulary
+    def __init__(self):  # takes no arguments; all sizes come from the module-level hyperparameters
+        super().__init__()
+        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
+        self.position_embedding_table = nn.Embedding(block_size, n_embd)  # one learned vector per position, so only block_size positions exist
+        self.blocks = nn.Sequential(
+            *[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
+        self.ln_f = nn.LayerNorm(n_embd)  # final LayerNorm, applied before lm_head
+        self.lm_head = nn.Linear(n_embd, vocab_size)
+        self.apply(self._init_weights)  # nn.Module.apply visits every submodule
+
+    def _init_weights(self, module: nn.Module) -> None:  # normal(mean=0, std=0.02) weights for Linear and Embedding; Linear biases zeroed
+        if isinstance(module, nn.Linear):
+            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
+            if module.bias is not None:  # the Head projections are created with bias=False
+                torch.nn.init.zeros_(module.bias)
+        elif isinstance(module, nn.Embedding):
+            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
+
+    def forward(self, idx: torch.Tensor, targets: Optional[torch.Tensor] = None) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:  # idx is unpacked as (B, T); returns (logits, loss), loss is None without targets
+        B, T = idx.shape
+        tok_emb = self.token_embedding_table(idx)
+        pos_emb = self.position_embedding_table(
+            torch.arange(T, device=idx.device))  # positions 0..T-1 on the same device as idx
+        x = tok_emb + pos_emb  # pos_emb broadcasts over the batch dimension
+        x = self.blocks(x)
+        x = self.ln_f(x)
+        logits = self.lm_head(x)
+
+        if targets is None:
+            loss = None  # logits keep their (B, T, vocab_size) shape on this path
+        else:
+            B, T, C = logits.shape
+            logits = logits.view(B * T, C)  # flattened for F.cross_entropy; the returned logits stay flattened on this path
+            targets = targets.view(B * T)
+            loss = F.cross_entropy(logits, targets)
+
+        return logits, loss
diff --git a/train_test/test.cpp b/train_test/test.cpp
new file mode 100644
index 0000000..41e25e7
--- /dev/null
+++ b/train_test/test.cpp
@@ -0,0 +1,96 @@
+#include
+#include
+#include
+#include
+#include