Build a Neural Network from Scratch in JavaScript

Modern deep learning frameworks hide thousands of lines behind a single model.fit(). This tutorial strips everything away. We build a fully-connected neural network in pure JavaScript — matrix multiply, activation functions, forward pass, mean-squared-error loss, backpropagation, and SGD — with zero external dependencies and full source code you can run in any browser.

1. Neurons and Layers

A single artificial neuron computes a weighted sum of its inputs plus a bias, then applies a non-linear activation function:

$$z = w_1 x_1 + w_2 x_2 + \dots + w_n x_n + b \quad \text{(pre-activation)}$$
$$a = \sigma(z) \quad \text{(post-activation / output)}$$

A layer is a column of neurons processing the same input vector in parallel. Its entire computation is one matrix-vector product:

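$$z = W x + b$$
$$W \in \mathbb{R}^{m \times n} \quad \text{(weight matrix: } m \text{ neurons, } n \text{ inputs)}$$
$$b \in \mathbb{R}^{m} \quad \text{(bias vector)}$$
$$x \in \mathbb{R}^{n} \quad \text{(input vector)}$$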

A network chains these layers: the output of one becomes the input of the next. Because each layer applies an affine transform followed by a non-linearity, such networks are universal function approximators: with enough hidden units they can approximate any continuous function on a compact domain to arbitrary accuracy (Universal Approximation Theorem, Cybenko 1989).

2. Matrix Operations

We represent matrices as flat Float64Arrays for speed. Key operations we need:

// Lightweight matrix library — row-major, flat array
/**
 * Create a rows×cols matrix backed by a flat, row-major Float64Array.
 * Element (i, j) lives at index i * cols + j of `d`.
 *
 * @param {number} rows - number of rows.
 * @param {number} cols - number of columns.
 * @param {ArrayLike<number>|Iterable<number>} [data] - optional initial values in
 *   row-major order; they are copied. When omitted (or falsy) the matrix is zero-filled.
 * @returns {{rows: number, cols: number, d: Float64Array}} the new matrix.
 * @throws {RangeError} if `data` does not hold exactly rows * cols values.
 */
function mat(rows, cols, data) {
  const size = rows * cols;
  if (!data) {
    return { rows, cols, d: new Float64Array(size) };
  }
  const d = new Float64Array(data);
  // A wrong-sized buffer would make every index computation below silently read
  // undefined (→ NaN) or ignore trailing values, so fail loudly here instead.
  if (d.length !== size) {
    throw new RangeError(
      `mat: expected ${size} values for a ${rows}x${cols} matrix, got ${d.length}`);
  }
  return { rows, cols, d };
}
/**
 * Matrix product C = A · B for row-major flat matrices.
 *
 * @param {{rows: number, cols: number, d: Float64Array}} A - left operand (m×k).
 * @param {{rows: number, cols: number, d: Float64Array}} B - right operand (k×n).
 * @returns {{rows: number, cols: number, d: Float64Array}} a new m×n matrix.
 * @throws {RangeError} if A.cols !== B.rows.
 */
function matMul(A, B) {
  // Without this check a mismatch reads past the end of B.d (undefined → NaN)
  // or silently ignores rows of B.
  if (A.cols !== B.rows) {
    throw new RangeError(
      `matMul: inner dimensions must agree, got ${A.rows}x${A.cols} · ${B.rows}x${B.cols}`);
  }
  const inner = A.cols;
  const outCols = B.cols;
  const C = mat(A.rows, outCols);
  for (let i = 0; i < A.rows; i++) {
    const aRow = i * inner;    // start of row i in A.d
    const cRow = i * outCols;  // start of row i in C.d
    for (let j = 0; j < outCols; j++) {
      let sum = 0;
      for (let k = 0; k < inner; k++) sum += A.d[aRow + k] * B.d[k * outCols + j];
      C.d[cRow + j] = sum;
    }
  }
  return C;
}
/**
 * Transpose: returns a new cols×rows matrix T with T[c][r] = A[r][c].
 * The input matrix is left untouched.
 */
function matT(A) {
  const { rows, cols, d: src } = A;
  const T = mat(cols, rows);
  // Walk the source in storage order; srcIdx is always r * cols + c.
  let srcIdx = 0;
  for (let r = 0; r < rows; r++) {
    for (let c = 0; c < cols; c++) {
      T.d[c * rows + r] = src[srcIdx];
      srcIdx += 1;
    }
  }
  return T;
}
/**
 * Broadcast-add the column vector b to every column of A:
 * result[r][c] = A[r][c] + b[r]. Returns a new matrix shaped like A.
 */
function matAdd(A, b) {
  const { rows, cols } = A;
  const sum = mat(rows, cols);
  for (let r = 0; r < rows; r++) {
    const rowStart = r * cols;
    const rowEnd = rowStart + cols;
    for (let idx = rowStart; idx < rowEnd; idx++) {
      sum.d[idx] = A.d[idx] + b.d[r];
    }
  }
  return sum;
}

3. Activation Functions

Sigmoid

σ(z) = 1/(1+e⁻ᶻ) — squashes to (0,1), classic but prone to vanishing gradients. σ'(z) = σ(z)(1−σ(z)).

Tanh

tanh(z) — zero-centered output in (−1,1), generally better than sigmoid for hidden layers. tanh'(z) = 1−tanh²(z).

ReLU

max(0,z) — no saturation for positive values, sparse activation, simple derivative: 0 or 1. Default choice for deep nets.

Softmax

eᶻᵢ/Σeᶻⱼ — converts logit vector to probability distribution, used in output layer for multi-class classification.

/**
 * Activation table. Each entry provides:
 *   f(z)  — the activation applied to a pre-activation value z
 *   df(a) — the derivative, expressed in terms of the activation OUTPUT a = f(z)
 */
const activations = {
  sigmoid: {
    f(z) {
      return 1 / (1 + Math.exp(-z));
    },
    // σ'(z) = σ(z)(1 − σ(z)), with a = σ(z)
    df(a) {
      return a * (1 - a);
    },
  },
  tanh: {
    f(z) {
      return Math.tanh(z);
    },
    // tanh'(z) = 1 − tanh²(z), with a = tanh(z)
    df(a) {
      return 1 - a * a;
    },
  },
  relu: {
    f(z) {
      return Math.max(0, z);
    },
    // a = max(0, z) is positive exactly when z is, so the slope is 1 there and 0 elsewhere
    df(a) {
      if (a > 0) return 1;
      return 0;
    },
  },
};

4. Forward Pass

The forward pass propagates input through every layer, storing the pre-activation Z and post-activation A at each layer — we will need them during backprop:

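For layer $\ell = 1, \dots, L$:
$$Z^{[\ell]} = W^{[\ell]} A^{[\ell-1]} + b^{[\ell]}$$
$$A^{[\ell]} = \sigma\left(Z^{[\ell]}\right) \quad \text{(element-wise)}$$
$$A^{[0]} = X \quad \text{(raw input)}$$
$$\hat{Y} = A^{[L]} \quad \text{(network output)}$$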

5. Loss Functions

The loss measures how wrong the network's prediction Ŷ is compared to the ground truth Y:

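MSE (regression): $$L = \frac{1}{n} \sum_{i} (\hat{y}_i - y_i)^2$$
BCE (binary class): $$L = -\frac{1}{n} \sum_{i} \left[ y_i \log \hat{y}_i + (1 - y_i) \log(1 - \hat{y}_i) \right]$$
CE (multi-class): $$L = -\frac{1}{n} \sum_{i} \sum_{j} y_{ij} \log \hat{y}_{ij}$$
Gradient of MSE w.r.t. $\hat{Y}$: $$\frac{\partial L}{\partial \hat{Y}} = \frac{2}{n} (\hat{Y} - Y)$$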

The gradient of the loss with respect to the output layer's activations is the starting point for backpropagation.

6. Backpropagation

Backprop applies the chain rule to propagate the loss gradient backward through each layer. For layer ℓ (going from L down to 1):

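$$\delta^{[L]} = \frac{\partial L}{\partial \hat{Y}} \odot \sigma'\left(Z^{[L]}\right) \quad \text{(output-layer error signal)}$$
For $\ell = L-1, \dots, 1$:
$$\delta^{[\ell]} = \left( W^{[\ell+1]\top} \delta^{[\ell+1]} \right) \odot \sigma'\left(Z^{[\ell]}\right)$$
Gradients:
$$dW^{[\ell]} = \frac{1}{n}\, \delta^{[\ell]} A^{[\ell-1]\top}$$
$$db^{[\ell]} = \operatorname{mean}\left(\delta^{[\ell]},\ \text{axis} = \text{samples}\right)$$
SGD update:
$$W^{[\ell]} \leftarrow W^{[\ell]} - lr \cdot dW^{[\ell]}$$
$$b^{[\ell]} \leftarrow b^{[\ell]} - lr \cdot db^{[\ell]}$$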

⊙ denotes element-wise (Hadamard) product. The key insight is that δ[ℓ] tells each layer how much it contributed to the overall error, allowing each weight to receive a targeted correction.

7. Full Network Implementation

// Fully-connected network — pure JS, zero dependencies
/**
 * One fully-connected layer computing A = σ(W · A_prev + b).
 *
 * Matrices hold one sample per column. `forward` caches A_prev, Z and A;
 * `backward` reads those caches to produce dW and db; `update` applies one
 * gradient-descent step using them.
 */
class DenseLayer {
  /**
   * @param {number} inSize - number of inputs (columns of W).
   * @param {number} outSize - number of neurons (rows of W).
   * @param {string} [activation='sigmoid'] - key into the `activations` table.
   * @throws {Error} if `activation` does not name an entry providing `f` and `df`.
   */
  constructor(inSize, outSize, activation = 'sigmoid') {
    const act = activations[activation];
    if (!act || typeof act.f !== 'function' || typeof act.df !== 'function') {
      throw new Error(`DenseLayer: unknown activation "${String(activation)}"`);
    }
    this.activation = act;
    // Xavier / Glorot *uniform* initialisation: W ~ U(−limit, +limit) with
    // limit = sqrt(6 / (fanIn + fanOut)), which gives Var(W) = 2 / (fanIn + fanOut).
    // sqrt(2 / (fanIn + fanOut)) is the standard deviation of the normal variant,
    // not the bound of the uniform one.
    const limit = Math.sqrt(6 / (inSize + outSize));
    this.W = mat(outSize, inSize,
      Array.from({ length: outSize * inSize }, () => (Math.random() * 2 - 1) * limit));
    this.b = mat(outSize, 1);  // zeros
    // Caches filled by forward()
    this.Z = null;
    this.A = null;
    this.A_prev = null;
    // Gradients filled by backward()
    this.dW = null;
    this.db = null;
  }

  /**
   * Forward pass. Stores the input, the pre-activation Z and the output A.
   * @param {{rows: number, cols: number, d: Float64Array}} A_prev - inSize × m input.
   * @returns {{rows: number, cols: number, d: Float64Array}} the outSize × m output A.
   */
  forward(A_prev) {
    this.A_prev = A_prev;
    this.Z = matAdd(matMul(this.W, A_prev), this.b);
    this.A = mat(this.Z.rows, this.Z.cols,
      this.Z.d.map((z) => this.activation.f(z)));
    return this.A;
  }

  /**
   * Backward pass. Must follow a forward() call on the same batch.
   * @param {{rows: number, cols: number, d: Float64Array}} delta_next - gradient of
   *   the loss w.r.t. this layer's output A (already multiplied by W[ℓ+1]ᵀ if a
   *   layer sits above this one).
   * @returns {{rows: number, cols: number, d: Float64Array}} Wᵀ · dZ, the gradient
   *   w.r.t. this layer's input, to pass to layer ℓ−1.
   * @throws {Error} if called before forward().
   */
  backward(delta_next) {
    if (this.A === null || this.A_prev === null) {
      throw new Error('DenseLayer.backward() called before forward()');
    }
    // dZ = delta ⊙ σ'(Z); df takes the cached activation output, not Z.
    const dZ = mat(this.Z.rows, this.Z.cols,
      delta_next.d.map((v, i) => v * this.activation.df(this.A.d[i])));
    const m = this.A_prev.cols;  // batch size: one sample per column
    // Both gradients are averaged over the m samples of the batch.
    this.dW = scalarDiv(matMul(dZ, matT(this.A_prev)), m);
    this.db = rowMean(dZ);
    return matMul(matT(this.W), dZ);  // δ to pass to layer ℓ−1
  }

  /**
   * Gradient-descent step: W ← W − lr·dW, b ← b − lr·db (in place).
   * @param {number} lr - learning rate.
   * @throws {Error} if called before backward().
   */
  update(lr) {
    if (this.dW === null || this.db === null) {
      throw new Error('DenseLayer.update() called before backward()');
    }
    for (let i = 0; i < this.W.d.length; i++) this.W.d[i] -= lr * this.dW.d[i];
    for (let i = 0; i < this.b.d.length; i++) this.b.d[i] -= lr * this.db.d[i];
  }
}

/**
 * A feed-forward stack of layers trained with gradient descent on the MSE loss.
 * Every layer must provide forward(A), backward(delta) and update(lr).
 */
class NeuralNetwork {
  /** @param {...object} layers - the layers, in input → output order. */
  constructor(...layers) { this.layers = layers; }

  /**
   * Run the forward pass through every layer in order.
   * @param {{rows: number, cols: number, d: Float64Array}} X - input, one sample per column.
   * @returns {{rows: number, cols: number, d: Float64Array}} output of the last layer.
   */
  predict(X) {
    let A = X;
    for (const layer of this.layers) A = layer.forward(A);
    return A;
  }

  /**
   * Perform one gradient-descent step on the batch (X, Y).
   * @param {{rows: number, cols: number, d: Float64Array}} X - inputs, one sample per column.
   * @param {{rows: number, cols: number, d: Float64Array}} Y - targets, same layout as the output.
   * @param {number} [lr=0.1] - learning rate.
   * @returns {number} the MSE of the predictions made BEFORE this step's update.
   * @throws {RangeError} if Y does not hold as many values as the network output.
   */
  train(X, Y, lr = 0.1) {
    const Yhat = this.predict(X);
    if (Y.d.length !== Yhat.d.length) {
      throw new RangeError(
        `NeuralNetwork.train: Y has ${Y.d.length} values but the network output has ${Yhat.d.length}`);
    }
    // Per-sample loss derivative 2·(ŷ − y). The 1/m batch average is deliberately
    // NOT applied here: DenseLayer.backward already divides dW by the batch size
    // and takes the row mean for db, so dividing here as well would shrink every
    // gradient by an extra factor of 1/m.
    let delta = mat(Yhat.rows, Yhat.cols,
      Yhat.d.map((v, i) => 2 * (v - Y.d[i])));
    // Back-propagate from the last layer to the first, then update all layers.
    for (let i = this.layers.length - 1; i >= 0; i--) {
      delta = this.layers[i].backward(delta);
    }
    for (const layer of this.layers) layer.update(lr);
    return mseLoss(Yhat, Y);
  }
}

// Utility: scalar divide, row mean
/** Divide every element of M by the scalar s; returns a new matrix shaped like M. */
function scalarDiv(M, s) {
  const { rows, cols, d } = M;
  const quotients = d.map((value) => value / s);
  return mat(rows, cols, quotients);
}
/**
 * Average each row of M across its columns.
 * Returns a new rows×1 column vector whose r-th entry is the mean of row r.
 */
function rowMean(M) {
  const { rows, cols, d } = M;
  const means = mat(rows, 1);
  for (let r = 0; r < rows; r++) {
    const rowStart = r * cols;
    let total = 0;
    for (let c = 0; c < cols; c++) {
      total += d[rowStart + c];
    }
    means.d[r] = total / cols;
  }
  return means;
}
/**
 * Mean squared error between predictions and targets, averaged over every
 * element of Yhat (all rows and all columns).
 */
function mseLoss(Yhat, Y) {
  const predicted = Yhat.d;
  const count = predicted.length;
  let total = 0;
  for (let i = 0; i < count; i++) {
    total += (predicted[i] - Y.d[i]) ** 2;
  }
  return total / count;
}

8. Training on XOR

XOR is the classic non-linearly-separable problem that a single-layer perceptron cannot solve. A hidden layer with at least 2 neurons can:

// XOR dataset — 4 samples, trained as a single batch.
// X is 2 rows × 4 columns: each column is one sample, each row one input feature.
const X = mat(2, 4, [
  0, 0, 1, 1,
  0, 1, 0, 1,
]);
// Target for each column of X, in the same order
const Y = mat(1, 4, [0, 1, 1, 0]);

const net = new NeuralNetwork(
  new DenseLayer(2, 4, 'tanh'),   // hidden: 4 neurons
  new DenseLayer(4, 1, 'sigmoid') // output: 1 neuron
);

// 10 000 training steps at learning rate 0.5; report the loss on every 1000th step
for (let epoch = 0; epoch < 10000; epoch += 1) {
  const loss = net.train(X, Y, 0.5);
  const isReportEpoch = epoch % 1000 === 0;
  if (isReportEpoch) {
    console.log(`Epoch ${epoch}: loss = ${loss.toFixed(5)}`);
  }
}

// After training, predictions should approximate [0, 1, 1, 0]
const pred = net.predict(X);
console.log('Predictions:', Array.from(pred.d, (v) => v.toFixed(3)));

Running this in the browser console trains in milliseconds. After 10 000 epochs the network reliably outputs values close to [0.02, 0.97, 0.97, 0.02] — correctly solving XOR with no library at all.

Next step: See Backpropagation — the Chain Rule Unwrapped for a deeper look at computational graphs, Jacobians and automatic differentiation.

🧠 Open Neural Network →