DATA_HUB = dict()
DATA_URL = 'http://d2l-data.s3-accelerate.amazonaws.com/'

import numpy as np
import tensorflow as tf

nn_Module = tf.keras.Model

################# WARNING ################
# The below part is generated automatically through:
#    d2lbook build lib
# Don't edit it directly

import collections
import hashlib
import inspect
import math
import os
import random
import re
import shutil
import sys
import tarfile
import time
import zipfile
from collections import defaultdict

import pandas as pd
import requests
from IPython import display
from matplotlib import pyplot as plt
from matplotlib_inline import backend_inline

d2l = sys.modules[__name__]

import numpy as np
import tensorflow as tf


def use_svg_display():
    """Use the svg format to display a plot in Jupyter.

    Defined in :numref:`sec_calculus`"""
    backend_inline.set_matplotlib_formats('svg')


def set_figsize(figsize=(3.5, 2.5)):
    """Set the figure size for matplotlib.

    Defined in :numref:`sec_calculus`"""
    use_svg_display()
    d2l.plt.rcParams['figure.figsize'] = figsize


def set_axes(axes, xlabel, ylabel, xlim, ylim, xscale, yscale, legend):
    """Set the axes for matplotlib.

    Defined in :numref:`sec_calculus`"""
    axes.set_xlabel(xlabel), axes.set_ylabel(ylabel)
    axes.set_xscale(xscale), axes.set_yscale(yscale)
    axes.set_xlim(xlim), axes.set_ylim(ylim)
    if legend:
        axes.legend(legend)
    axes.grid()


def plot(X, Y=None, xlabel=None, ylabel=None, legend=[], xlim=None,
         ylim=None, xscale='linear', yscale='linear',
         fmts=('-', 'm--', 'g-.', 'r:'), figsize=(3.5, 2.5), axes=None):
    """Plot data points.

    Defined in :numref:`sec_calculus`"""

    def has_one_axis(X):  # True if X (tensor or list) has 1 axis
        return (hasattr(X, "ndim") and X.ndim == 1 or
                isinstance(X, list) and not hasattr(X[0], "__len__"))

    if has_one_axis(X):
        X = [X]
    if Y is None:
        X, Y = [[]] * len(X), X
    elif has_one_axis(Y):
        Y = [Y]
    if len(X) != len(Y):
        X = X * len(Y)

    set_figsize(figsize)
    if axes is None:
        axes = d2l.plt.gca()
    axes.cla()
    for x, y, fmt in zip(X, Y, fmts):
        axes.plot(x, y, fmt) if len(x) else axes.plot(y, fmt)
    set_axes(axes, xlabel, ylabel, xlim, ylim, xscale, yscale, legend)


def add_to_class(Class):
    """Register functions as methods in created class.

    Defined in :numref:`sec_oo-design`"""
    def wrapper(obj):
        setattr(Class, obj.__name__, obj)
    return wrapper


class HyperParameters:
    """The base class of hyperparameters."""
    def save_hyperparameters(self, ignore=[]):
        """Defined in :numref:`sec_oo-design`"""
        raise NotImplementedError

    def save_hyperparameters(self, ignore=[]):
        """Save function arguments into class attributes.

        Defined in :numref:`sec_utils`"""
        frame = inspect.currentframe().f_back
        _, _, _, local_vars = inspect.getargvalues(frame)
        self.hparams = {k: v for k, v in local_vars.items()
                        if k not in set(ignore + ['self'])
                        and not k.startswith('_')}
        for k, v in self.hparams.items():
            setattr(self, k, v)


class ProgressBoard(d2l.HyperParameters):
    """The board that plots data points in animation.
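# --- Usage sketch (editorial illustration, not generated by d2lbook) ---
# A minimal example of `save_hyperparameters` and `add_to_class`; the `Toy`
# class and the `_demo_*` helper are hypothetical names used only here, and
# nothing below runs at import time.
def _demo_oo_design_utilities():
    class Toy(HyperParameters):
        def __init__(self, a, b=2):
            self.save_hyperparameters()  # stores a and b as attributes

    toy = Toy(a=1)
    assert toy.a == 1 and toy.b == 2

    @add_to_class(Toy)
    def product(self):
        return self.a * self.b

    assert toy.product() == 2  # the method was patched onto the existing class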
Defined in :numref:`sec_oo-design`""" def __init__(self, xlabel=None, ylabel=None, xlim=None, ylim=None, xscale='linear', yscale='linear', ls=['-', '--', '-.', ':'], colors=['C0', 'C1', 'C2', 'C3'], fig=None, axes=None, figsize=(3.5, 2.5), display=True): self.save_hyperparameters() def draw(self, x, y, label, every_n=1): raise NotImplemented def draw(self, x, y, label, every_n=1): """Defined in :numref:`sec_utils`""" Point = collections.namedtuple('Point', ['x', 'y']) if not hasattr(self, 'raw_points'): self.raw_points = collections.OrderedDict() self.data = collections.OrderedDict() if label not in self.raw_points: self.raw_points[label] = [] self.data[label] = [] points = self.raw_points[label] line = self.data[label] points.append(Point(x, y)) if len(points) != every_n: return mean = lambda x: sum(x) / len(x) line.append(Point(mean([p.x for p in points]), mean([p.y for p in points]))) points.clear() if not self.display: return d2l.use_svg_display() if self.fig is None: self.fig = d2l.plt.figure(figsize=self.figsize) plt_lines, labels = [], [] for (k, v), ls, color in zip(self.data.items(), self.ls, self.colors): plt_lines.append(d2l.plt.plot([p.x for p in v], [p.y for p in v], linestyle=ls, color=color)[0]) labels.append(k) axes = self.axes if self.axes else d2l.plt.gca() if self.xlim: axes.set_xlim(self.xlim) if self.ylim: axes.set_ylim(self.ylim) if not self.xlabel: self.xlabel = self.x axes.set_xlabel(self.xlabel) axes.set_ylabel(self.ylabel) axes.set_xscale(self.xscale) axes.set_yscale(self.yscale) axes.legend(plt_lines, labels) display.display(self.fig) display.clear_output(wait=True) class Module(d2l.nn_Module, d2l.HyperParameters): """The base class of models. Defined in :numref:`sec_oo-design`""" def __init__(self, plot_train_per_epoch=2, plot_valid_per_epoch=1): super().__init__() self.save_hyperparameters() self.board = ProgressBoard() self.training = None def loss(self, y_hat, y): raise NotImplementedError def forward(self, X): assert hasattr(self, 'net'), 'Neural network is defined' return self.net(X) def call(self, X, *args, **kwargs): if kwargs and "training" in kwargs: self.training = kwargs['training'] return self.forward(X, *args) def plot(self, key, value, train): """Plot a point in animation.""" assert hasattr(self, 'trainer'), 'Trainer is not inited' self.board.xlabel = 'epoch' if train: x = self.trainer.train_batch_idx / \ self.trainer.num_train_batches n = self.trainer.num_train_batches / \ self.plot_train_per_epoch else: x = self.trainer.epoch + 1 n = self.trainer.num_val_batches / \ self.plot_valid_per_epoch self.board.draw(x, d2l.numpy(value), ( 'train_' if train else 'val_') + key, every_n=int(n)) def training_step(self, batch): l = self.loss(self(*batch[:-1]), batch[-1]) self.plot('loss', l, train=True) return l def validation_step(self, batch): l = self.loss(self(*batch[:-1]), batch[-1]) self.plot('loss', l, train=False) def configure_optimizers(self): raise NotImplementedError def configure_optimizers(self): """Defined in :numref:`sec_classification`""" return tf.keras.optimizers.SGD(self.lr) class DataModule(d2l.HyperParameters): """The base class of data. 
Defined in :numref:`subsec_oo-design-models`""" def __init__(self, root='../data'): self.save_hyperparameters() def get_dataloader(self, train): raise NotImplementedError def train_dataloader(self): return self.get_dataloader(train=True) def val_dataloader(self): return self.get_dataloader(train=False) def get_tensorloader(self, tensors, train, indices=slice(0, None)): """Defined in :numref:`sec_synthetic-regression-data`""" tensors = tuple(a[indices] for a in tensors) shuffle_buffer = tensors[0].shape[0] if train else 1 return tf.data.Dataset.from_tensor_slices(tensors).shuffle( buffer_size=shuffle_buffer).batch(self.batch_size) class Trainer(d2l.HyperParameters): """The base class for training models with data. Defined in :numref:`subsec_oo-design-models`""" def __init__(self, max_epochs, num_gpus=0, gradient_clip_val=0): self.save_hyperparameters() assert num_gpus == 0, 'No GPU support yet' def prepare_data(self, data): self.train_dataloader = data.train_dataloader() self.val_dataloader = data.val_dataloader() self.num_train_batches = len(self.train_dataloader) self.num_val_batches = (len(self.val_dataloader) if self.val_dataloader is not None else 0) def prepare_model(self, model): model.trainer = self model.board.xlim = [0, self.max_epochs] self.model = model def fit(self, model, data): self.prepare_data(data) self.prepare_model(model) self.optim = model.configure_optimizers() self.epoch = 0 self.train_batch_idx = 0 self.val_batch_idx = 0 for self.epoch in range(self.max_epochs): self.fit_epoch() def fit_epoch(self): raise NotImplementedError def prepare_batch(self, batch): """Defined in :numref:`sec_linear_scratch`""" return batch def fit_epoch(self): """Defined in :numref:`sec_linear_scratch`""" self.model.training = True for batch in self.train_dataloader: with tf.GradientTape() as tape: loss = self.model.training_step(self.prepare_batch(batch)) grads = tape.gradient(loss, self.model.trainable_variables) if self.gradient_clip_val > 0: grads = self.clip_gradients(self.gradient_clip_val, grads) self.optim.apply_gradients(zip(grads, self.model.trainable_variables)) self.train_batch_idx += 1 if self.val_dataloader is None: return self.model.training = False for batch in self.val_dataloader: self.model.validation_step(self.prepare_batch(batch)) self.val_batch_idx += 1 def clip_gradients(self, grad_clip_val, grads): """Defined in :numref:`sec_rnn-scratch`""" grad_clip_val = tf.constant(grad_clip_val, dtype=tf.float32) new_grads = [tf.convert_to_tensor(grad) if isinstance( grad, tf.IndexedSlices) else grad for grad in grads] norm = tf.math.sqrt(sum((tf.reduce_sum(grad ** 2)) for grad in new_grads)) if tf.greater(norm, grad_clip_val): for i, grad in enumerate(new_grads): new_grads[i] = grad * grad_clip_val / norm return new_grads return grads class SyntheticRegressionData(d2l.DataModule): """Synthetic data for linear regression. 
Defined in :numref:`sec_synthetic-regression-data`""" def __init__(self, w, b, noise=0.01, num_train=1000, num_val=1000, batch_size=32): super().__init__() self.save_hyperparameters() n = num_train + num_val self.X = tf.random.normal((n, w.shape[0])) noise = tf.random.normal((n, 1)) * noise self.y = d2l.matmul(self.X, d2l.reshape(w, (-1, 1))) + b + noise def get_dataloader(self, train): """Defined in :numref:`sec_synthetic-regression-data`""" i = slice(0, self.num_train) if train else slice(self.num_train, None) return self.get_tensorloader((self.X, self.y), train, i) class LinearRegressionScratch(d2l.Module): """The linear regression model implemented from scratch. Defined in :numref:`sec_linear_scratch`""" def __init__(self, num_inputs, lr, sigma=0.01): super().__init__() self.save_hyperparameters() w = tf.random.normal((num_inputs, 1), mean=0, stddev=0.01) b = tf.zeros(1) self.w = tf.Variable(w, trainable=True) self.b = tf.Variable(b, trainable=True) def forward(self, X): """Defined in :numref:`sec_linear_scratch`""" return d2l.matmul(X, self.w) + self.b def loss(self, y_hat, y): """Defined in :numref:`sec_linear_scratch`""" l = (y_hat - y) ** 2 / 2 return d2l.reduce_mean(l) def configure_optimizers(self): """Defined in :numref:`sec_linear_scratch`""" return SGD(self.lr) class SGD(d2l.HyperParameters): """Minibatch stochastic gradient descent. Defined in :numref:`sec_linear_scratch`""" def __init__(self, lr): self.save_hyperparameters() def apply_gradients(self, grads_and_vars): for grad, param in grads_and_vars: param.assign_sub(self.lr * grad) class LinearRegression(d2l.Module): """The linear regression model implemented with high-level APIs. Defined in :numref:`sec_linear_concise`""" def __init__(self, lr): super().__init__() self.save_hyperparameters() initializer = tf.initializers.RandomNormal(stddev=0.01) self.net = tf.keras.layers.Dense(1, kernel_initializer=initializer) def forward(self, X): """Defined in :numref:`sec_linear_concise`""" return self.net(X) def loss(self, y_hat, y): """Defined in :numref:`sec_linear_concise`""" fn = tf.keras.losses.MeanSquaredError() return fn(y, y_hat) def configure_optimizers(self): """Defined in :numref:`sec_linear_concise`""" return tf.keras.optimizers.SGD(self.lr) def get_w_b(self): """Defined in :numref:`sec_linear_concise`""" return (self.get_weights()[0], self.get_weights()[1]) class FashionMNIST(d2l.DataModule): """The Fashion-MNIST dataset. Defined in :numref:`sec_fashion_mnist`""" def __init__(self, batch_size=64, resize=(28, 28)): super().__init__() self.save_hyperparameters() self.train, self.val = tf.keras.datasets.fashion_mnist.load_data() def text_labels(self, indices): """Return text labels. 
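# --- Usage sketch (editorial illustration, not generated by d2lbook) ---
# How the pieces above fit together: synthetic data, the scratch model and the
# Trainer. Sizes are illustrative and the helper is not called on import.
def _demo_linear_regression_scratch():
    data = SyntheticRegressionData(w=tf.constant([2.0, -3.4]), b=4.2,
                                   num_train=64, num_val=64, batch_size=8)
    model = LinearRegressionScratch(2, lr=0.03)
    trainer = Trainer(max_epochs=1)
    trainer.fit(model, data)
    # After fitting, model.w and model.b should move toward w and b.
    return model.w.numpy(), model.b.numpy()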
Defined in :numref:`sec_fashion_mnist`""" labels = ['t-shirt', 'trouser', 'pullover', 'dress', 'coat', 'sandal', 'shirt', 'sneaker', 'bag', 'ankle boot'] return [labels[int(i)] for i in indices] def get_dataloader(self, train): """Defined in :numref:`sec_fashion_mnist`""" data = self.train if train else self.val process = lambda X, y: (tf.expand_dims(X, axis=3) / 255, tf.cast(y, dtype='int32')) resize_fn = lambda X, y: (tf.image.resize_with_pad(X, *self.resize), y) shuffle_buf = len(data[0]) if train else 1 return tf.data.Dataset.from_tensor_slices(process(*data)).batch( self.batch_size).map(resize_fn).shuffle(shuffle_buf) def visualize(self, batch, nrows=1, ncols=8, labels=[]): """Defined in :numref:`sec_fashion_mnist`""" X, y = batch if not labels: labels = self.text_labels(y) d2l.show_images(tf.squeeze(X), nrows, ncols, titles=labels) def show_images(imgs, num_rows, num_cols, titles=None, scale=1.5): """Plot a list of images. Defined in :numref:`sec_fashion_mnist`""" raise NotImplementedError class Classifier(d2l.Module): """The base class of classification models. Defined in :numref:`sec_classification`""" def validation_step(self, batch): Y_hat = self(*batch[:-1]) self.plot('loss', self.loss(Y_hat, batch[-1]), train=False) self.plot('acc', self.accuracy(Y_hat, batch[-1]), train=False) def accuracy(self, Y_hat, Y, averaged=True): """Compute the number of correct predictions. Defined in :numref:`sec_classification`""" Y_hat = d2l.reshape(Y_hat, (-1, Y_hat.shape[-1])) preds = d2l.astype(d2l.argmax(Y_hat, axis=1), Y.dtype) compare = d2l.astype(preds == d2l.reshape(Y, -1), d2l.float32) return d2l.reduce_mean(compare) if averaged else compare def loss(self, Y_hat, Y, averaged=True): """Defined in :numref:`sec_softmax_concise`""" Y_hat = d2l.reshape(Y_hat, (-1, Y_hat.shape[-1])) Y = d2l.reshape(Y, (-1,)) fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) return fn(Y, Y_hat) def layer_summary(self, X_shape): """Defined in :numref:`sec_lenet`""" X = d2l.normal(X_shape) for layer in self.net.layers: X = layer(X) print(layer.__class__.__name__, 'output shape:\t', X.shape) class SoftmaxRegression(d2l.Classifier): """The softmax regression model. Defined in :numref:`sec_softmax_concise`""" def __init__(self, num_outputs, lr): super().__init__() self.save_hyperparameters() self.net = tf.keras.models.Sequential() self.net.add(tf.keras.layers.Flatten()) self.net.add(tf.keras.layers.Dense(num_outputs)) def forward(self, X): return self.net(X) def cpu(): """Get the CPU device. Defined in :numref:`sec_use_gpu`""" return tf.device('/CPU:0') def gpu(i=0): """Get a GPU device. Defined in :numref:`sec_use_gpu`""" return tf.device(f'/GPU:{i}') def num_gpus(): """Get the number of available GPUs. Defined in :numref:`sec_use_gpu`""" return len(tf.config.experimental.list_physical_devices('GPU')) def try_gpu(i=0): """Return gpu(i) if exists, otherwise return cpu(). Defined in :numref:`sec_use_gpu`""" if num_gpus() >= i + 1: return gpu(i) return cpu() def try_all_gpus(): """Return all available GPUs, or [cpu(),] if no GPU exists. Defined in :numref:`sec_use_gpu`""" return [gpu(i) for i in range(num_gpus())] def corr2d(X, K): """Compute 2D cross-correlation. Defined in :numref:`sec_conv_layer`""" h, w = K.shape Y = tf.Variable(tf.zeros((X.shape[0] - h + 1, X.shape[1] - w + 1))) for i in range(Y.shape[0]): for j in range(Y.shape[1]): Y[i, j].assign(tf.reduce_sum( X[i: i + h, j: j + w] * K)) return Y class LeNet(d2l.Classifier): """The LeNet-5 model. 
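# --- Usage sketch (editorial illustration, not generated by d2lbook) ---
# `corr2d` on a toy image with a [1, -1] kernel detects vertical edges; the
# values are illustrative only and the helper is not called on import.
def _demo_corr2d_edge_detection():
    X = tf.concat([tf.ones((6, 2)), tf.zeros((6, 4)), tf.ones((6, 2))], axis=1)
    K = tf.constant([[1.0, -1.0]])
    Y = corr2d(X, K)   # +1 at the white-to-black edge, -1 at the opposite edge
    assert Y.shape == (6, 7)
    return Y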
Defined in :numref:`sec_lenet`""" def __init__(self, lr=0.1, num_classes=10): super().__init__() self.save_hyperparameters() self.net = tf.keras.models.Sequential([ tf.keras.layers.Conv2D(filters=6, kernel_size=5, activation='sigmoid', padding='same'), tf.keras.layers.AvgPool2D(pool_size=2, strides=2), tf.keras.layers.Conv2D(filters=16, kernel_size=5, activation='sigmoid'), tf.keras.layers.AvgPool2D(pool_size=2, strides=2), tf.keras.layers.Flatten(), tf.keras.layers.Dense(120, activation='sigmoid'), tf.keras.layers.Dense(84, activation='sigmoid'), tf.keras.layers.Dense(num_classes)]) class Residual(tf.keras.Model): """The Residual block of ResNet models. Defined in :numref:`sec_resnet`""" def __init__(self, num_channels, use_1x1conv=False, strides=1): super().__init__() self.conv1 = tf.keras.layers.Conv2D(num_channels, padding='same', kernel_size=3, strides=strides) self.conv2 = tf.keras.layers.Conv2D(num_channels, kernel_size=3, padding='same') self.conv3 = None if use_1x1conv: self.conv3 = tf.keras.layers.Conv2D(num_channels, kernel_size=1, strides=strides) self.bn1 = tf.keras.layers.BatchNormalization() self.bn2 = tf.keras.layers.BatchNormalization() def call(self, X): Y = tf.keras.activations.relu(self.bn1(self.conv1(X))) Y = self.bn2(self.conv2(Y)) if self.conv3 is not None: X = self.conv3(X) Y += X return tf.keras.activations.relu(Y) class ResNeXtBlock(tf.keras.Model): """The ResNeXt block. Defined in :numref:`subsec_residual-blks`""" def __init__(self, num_channels, groups, bot_mul, use_1x1conv=False, strides=1): super().__init__() bot_channels = int(round(num_channels * bot_mul)) self.conv1 = tf.keras.layers.Conv2D(bot_channels, 1, strides=1) self.conv2 = tf.keras.layers.Conv2D(bot_channels, 3, strides=strides, padding="same", groups=bot_channels//groups) self.conv3 = tf.keras.layers.Conv2D(num_channels, 1, strides=1) self.bn1 = tf.keras.layers.BatchNormalization() self.bn2 = tf.keras.layers.BatchNormalization() self.bn3 = tf.keras.layers.BatchNormalization() if use_1x1conv: self.conv4 = tf.keras.layers.Conv2D(num_channels, 1, strides=strides) self.bn4 = tf.keras.layers.BatchNormalization() else: self.conv4 = None def call(self, X): Y = tf.keras.activations.relu(self.bn1(self.conv1(X))) Y = tf.keras.activations.relu(self.bn2(self.conv2(Y))) Y = self.bn3(self.conv3(Y)) if self.conv4: X = self.bn4(self.conv4(X)) return tf.keras.activations.relu(Y + X) class TimeMachine(d2l.DataModule): """The Time Machine dataset. 
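# --- Usage sketch (editorial illustration, not generated by d2lbook) ---
# Shape checks for the Residual block above (NHWC inputs); with
# use_1x1conv=True and strides=2 the spatial size halves and channels change.
def _demo_residual_shapes():
    X = tf.random.normal((4, 6, 6, 3))
    blk = Residual(num_channels=3)
    assert blk(X).shape == X.shape
    blk2 = Residual(num_channels=6, use_1x1conv=True, strides=2)
    assert blk2(X).shape == (4, 3, 3, 6)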
    Defined in :numref:`sec_text-sequence`"""
    def _download(self):
        fname = d2l.download(d2l.DATA_URL + 'timemachine.txt', self.root,
                             '090b5e7e70c295757f55df93cb0a180b9691891a')
        with open(fname) as f:
            return f.read()

    def _preprocess(self, text):
        """Defined in :numref:`sec_text-sequence`"""
        return re.sub('[^A-Za-z]+', ' ', text).lower()

    def _tokenize(self, text):
        """Defined in :numref:`sec_text-sequence`"""
        return list(text)

    def build(self, raw_text, vocab=None):
        """Defined in :numref:`sec_text-sequence`"""
        tokens = self._tokenize(self._preprocess(raw_text))
        if vocab is None:
            vocab = Vocab(tokens)
        corpus = [vocab[token] for token in tokens]
        return corpus, vocab

    def __init__(self, batch_size, num_steps, num_train=10000, num_val=5000):
        """Defined in :numref:`sec_language-model`"""
        super(d2l.TimeMachine, self).__init__()
        self.save_hyperparameters()
        corpus, self.vocab = self.build(self._download())
        array = d2l.tensor([corpus[i:i+num_steps+1]
                            for i in range(len(corpus)-num_steps)])
        self.X, self.Y = array[:, :-1], array[:, 1:]

    def get_dataloader(self, train):
        """Defined in :numref:`subsec_partitioning-seqs`"""
        idx = slice(0, self.num_train) if train else slice(
            self.num_train, self.num_train + self.num_val)
        return self.get_tensorloader([self.X, self.Y], train, idx)


class Vocab:
    """Vocabulary for text."""
    def __init__(self, tokens=[], min_freq=0, reserved_tokens=[]):
        """Defined in :numref:`sec_text-sequence`"""
        # Flatten a 2D list if needed
        if tokens and isinstance(tokens[0], list):
            tokens = [token for line in tokens for token in line]
        # Count token frequencies
        counter = collections.Counter(tokens)
        self.token_freqs = sorted(counter.items(), key=lambda x: x[1],
                                  reverse=True)
        # The list of unique tokens, including the reserved '<unk>' token
        self.idx_to_token = list(sorted(set(['<unk>'] + reserved_tokens + [
            token for token, freq in self.token_freqs if freq >= min_freq])))
        self.token_to_idx = {token: idx
                             for idx, token in enumerate(self.idx_to_token)}

    def __len__(self):
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        if hasattr(indices, '__len__') and len(indices) > 1:
            return [self.idx_to_token[int(index)] for index in indices]
        return self.idx_to_token[indices]

    @property
    def unk(self):  # Index for the unknown token
        return self.token_to_idx['<unk>']


class RNNScratch(d2l.Module):
    """The RNN model implemented from scratch.

    Defined in :numref:`sec_rnn-scratch`"""
    def __init__(self, num_inputs, num_hiddens, sigma=0.01):
        super().__init__()
        self.save_hyperparameters()
        self.W_xh = tf.Variable(d2l.normal(
            (num_inputs, num_hiddens)) * sigma)
        self.W_hh = tf.Variable(d2l.normal(
            (num_hiddens, num_hiddens)) * sigma)
        self.b_h = tf.Variable(d2l.zeros(num_hiddens))

    def forward(self, inputs, state=None):
        """Defined in :numref:`sec_rnn-scratch`"""
        if state is None:
            # Initial state with shape: (batch_size, num_hiddens)
            state = d2l.zeros((inputs.shape[1], self.num_hiddens))
        else:
            state, = state
            state = d2l.reshape(state, (-1, self.num_hiddens))
        outputs = []
        for X in inputs:  # Shape of inputs: (num_steps, batch_size, num_inputs)
            state = d2l.tanh(d2l.matmul(X, self.W_xh) +
                             d2l.matmul(state, self.W_hh) + self.b_h)
            outputs.append(state)
        return outputs, state


def check_len(a, n):
    """Check the length of a list.

    Defined in :numref:`sec_rnn-scratch`"""
    assert len(a) == n, f'list\'s length {len(a)} != expected length {n}'


def check_shape(a, shape):
    """Check the shape of a tensor.
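# --- Usage sketch (editorial illustration, not generated by d2lbook) ---
# Building a Vocab from a toy token list; unknown tokens map to the reserved
# '<unk>' index. Strings are illustrative; nothing runs at import time.
def _demo_vocab():
    tokens = ['the', 'time', 'machine', 'the', 'time', 'traveller']
    vocab = Vocab(tokens)
    assert vocab['never-seen'] == vocab.unk
    assert vocab.to_tokens(vocab[['time', 'machine']]) == ['time', 'machine']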
Defined in :numref:`sec_rnn-scratch`""" assert a.shape == shape, \ f'tensor\'s shape {a.shape} != expected shape {shape}' class RNNLMScratch(d2l.Classifier): """The RNN-based language model implemented from scratch. Defined in :numref:`sec_rnn-scratch`""" def __init__(self, rnn, vocab_size, lr=0.01): super().__init__() self.save_hyperparameters() self.init_params() def init_params(self): self.W_hq = tf.Variable(d2l.normal( (self.rnn.num_hiddens, self.vocab_size)) * self.rnn.sigma) self.b_q = tf.Variable(d2l.zeros(self.vocab_size)) def training_step(self, batch): l = self.loss(self(*batch[:-1]), batch[-1]) self.plot('ppl', d2l.exp(l), train=True) return l def validation_step(self, batch): l = self.loss(self(*batch[:-1]), batch[-1]) self.plot('ppl', d2l.exp(l), train=False) def one_hot(self, X): """Defined in :numref:`sec_rnn-scratch`""" # Output shape: (num_steps, batch_size, vocab_size) return tf.one_hot(tf.transpose(X), self.vocab_size) def output_layer(self, rnn_outputs): """Defined in :numref:`sec_rnn-scratch`""" outputs = [d2l.matmul(H, self.W_hq) + self.b_q for H in rnn_outputs] return d2l.stack(outputs, 1) def forward(self, X, state=None): """Defined in :numref:`sec_rnn-scratch`""" embs = self.one_hot(X) rnn_outputs, _ = self.rnn(embs, state) return self.output_layer(rnn_outputs) def predict(self, prefix, num_preds, vocab, device=None): """Defined in :numref:`sec_rnn-scratch`""" state, outputs = None, [vocab[prefix[0]]] for i in range(len(prefix) + num_preds - 1): X = d2l.tensor([[outputs[-1]]]) embs = self.one_hot(X) rnn_outputs, state = self.rnn(embs, state) if i < len(prefix) - 1: # Warm-up period outputs.append(vocab[prefix[i + 1]]) else: # Predict num_preds steps Y = self.output_layer(rnn_outputs) outputs.append(int(d2l.reshape(d2l.argmax(Y, axis=2), 1))) return ''.join([vocab.idx_to_token[i] for i in outputs]) class RNN(d2l.Module): """The RNN model implemented with high-level APIs. Defined in :numref:`sec_rnn-concise`""" def __init__(self, num_hiddens): super().__init__() self.save_hyperparameters() self.rnn = tf.keras.layers.SimpleRNN( num_hiddens, return_sequences=True, return_state=True, time_major=True) def forward(self, inputs, H=None): outputs, H = self.rnn(inputs, H) return outputs, H class RNNLM(d2l.RNNLMScratch): """The RNN-based language model implemented with high-level APIs. Defined in :numref:`sec_rnn-concise`""" def init_params(self): self.linear = tf.keras.layers.Dense(self.vocab_size) def output_layer(self, hiddens): return d2l.transpose(self.linear(hiddens), (1, 0, 2)) class GRU(d2l.RNN): """The multilayer GRU model. Defined in :numref:`sec_deep_rnn`""" def __init__(self, num_hiddens, num_layers, dropout=0): d2l.Module.__init__(self) self.save_hyperparameters() gru_cells = [tf.keras.layers.GRUCell(num_hiddens, dropout=dropout) for _ in range(num_layers)] self.rnn = tf.keras.layers.RNN(gru_cells, return_sequences=True, return_state=True, time_major=True) def forward(self, X, state=None): outputs, *state = self.rnn(X, state) return outputs, state class MTFraEng(d2l.DataModule): """The English-French dataset. 
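# --- Usage sketch (editorial illustration, not generated by d2lbook) ---
# Shape check for the scratch RNN language model: token indices go in, one
# logit per vocabulary entry comes out at every step. Sizes are illustrative.
def _demo_rnn_lm_scratch_shapes():
    batch_size, num_steps, num_hiddens, vocab_size = 4, 7, 32, 26
    rnn = RNNScratch(num_inputs=vocab_size, num_hiddens=num_hiddens)
    model = RNNLMScratch(rnn, vocab_size)
    outputs = model(tf.ones((batch_size, num_steps), dtype=tf.int64))
    check_shape(outputs, (batch_size, num_steps, vocab_size))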
Defined in :numref:`sec_machine_translation`""" def _download(self): d2l.extract(d2l.download( d2l.DATA_URL+'fra-eng.zip', self.root, '94646ad1522d915e7b0f9296181140edcf86a4f5')) with open(self.root + '/fra-eng/fra.txt', encoding='utf-8') as f: return f.read() def _preprocess(self, text): """Defined in :numref:`sec_machine_translation`""" # Replace non-breaking space with space text = text.replace('\u202f', ' ').replace('\xa0', ' ') # Insert space between words and punctuation marks no_space = lambda char, prev_char: char in ',.!?' and prev_char != ' ' out = [' ' + char if i > 0 and no_space(char, text[i - 1]) else char for i, char in enumerate(text.lower())] return ''.join(out) def _tokenize(self, text, max_examples=None): """Defined in :numref:`sec_machine_translation`""" src, tgt = [], [] for i, line in enumerate(text.split('\n')): if max_examples and i > max_examples: break parts = line.split('\t') if len(parts) == 2: # Skip empty tokens src.append([t for t in f'{parts[0]} '.split(' ') if t]) tgt.append([t for t in f'{parts[1]} '.split(' ') if t]) return src, tgt def __init__(self, batch_size, num_steps=9, num_train=512, num_val=128): """Defined in :numref:`sec_machine_translation`""" super(MTFraEng, self).__init__() self.save_hyperparameters() self.arrays, self.src_vocab, self.tgt_vocab = self._build_arrays( self._download()) def _build_arrays(self, raw_text, src_vocab=None, tgt_vocab=None): """Defined in :numref:`subsec_loading-seq-fixed-len`""" def _build_array(sentences, vocab, is_tgt=False): pad_or_trim = lambda seq, t: ( seq[:t] if len(seq) > t else seq + [''] * (t - len(seq))) sentences = [pad_or_trim(s, self.num_steps) for s in sentences] if is_tgt: sentences = [[''] + s for s in sentences] if vocab is None: vocab = d2l.Vocab(sentences, min_freq=2) array = d2l.tensor([vocab[s] for s in sentences]) valid_len = d2l.reduce_sum( d2l.astype(array != vocab[''], d2l.int32), 1) return array, vocab, valid_len src, tgt = self._tokenize(self._preprocess(raw_text), self.num_train + self.num_val) src_array, src_vocab, src_valid_len = _build_array(src, src_vocab) tgt_array, tgt_vocab, _ = _build_array(tgt, tgt_vocab, True) return ((src_array, tgt_array[:,:-1], src_valid_len, tgt_array[:,1:]), src_vocab, tgt_vocab) def get_dataloader(self, train): """Defined in :numref:`subsec_loading-seq-fixed-len`""" idx = slice(0, self.num_train) if train else slice(self.num_train, None) return self.get_tensorloader(self.arrays, train, idx) def build(self, src_sentences, tgt_sentences): """Defined in :numref:`subsec_loading-seq-fixed-len`""" raw_text = '\n'.join([src + '\t' + tgt for src, tgt in zip( src_sentences, tgt_sentences)]) arrays, _, _ = self._build_arrays( raw_text, self.src_vocab, self.tgt_vocab) return arrays def show_list_len_pair_hist(legend, xlabel, ylabel, xlist, ylist): """Plot the histogram for list length pairs. Defined in :numref:`sec_machine_translation`""" d2l.set_figsize() _, _, patches = d2l.plt.hist( [[len(l) for l in xlist], [len(l) for l in ylist]]) d2l.plt.xlabel(xlabel) d2l.plt.ylabel(ylabel) for patch in patches[1].patches: patch.set_hatch('/') d2l.plt.legend(legend) class Encoder(tf.keras.layers.Layer): """The base encoder interface for the encoder--decoder architecture. 
Defined in :numref:`sec_encoder-decoder`""" def __init__(self): super().__init__() # Later there can be additional arguments (e.g., length excluding padding) def call(self, X, *args): raise NotImplementedError class Decoder(tf.keras.layers.Layer): """The base decoder interface for the encoder--decoder architecture. Defined in :numref:`sec_encoder-decoder`""" def __init__(self): super().__init__() # Later there can be additional arguments (e.g., length excluding padding) def init_state(self, enc_all_outputs, *args): raise NotImplementedError def call(self, X, state): raise NotImplementedError class EncoderDecoder(d2l.Classifier): """The base class for the encoder--decoder architecture. Defined in :numref:`sec_encoder-decoder`""" def __init__(self, encoder, decoder): super().__init__() self.encoder = encoder self.decoder = decoder def call(self, enc_X, dec_X, *args): enc_all_outputs = self.encoder(enc_X, *args, training=True) dec_state = self.decoder.init_state(enc_all_outputs, *args) # Return decoder output only return self.decoder(dec_X, dec_state, training=True)[0] def predict_step(self, batch, device, num_steps, save_attention_weights=False): """Defined in :numref:`sec_seq2seq_training`""" src, tgt, src_valid_len, _ = batch enc_all_outputs = self.encoder(src, src_valid_len, training=False) dec_state = self.decoder.init_state(enc_all_outputs, src_valid_len) outputs, attention_weights = [d2l.expand_dims(tgt[:, 0], 1), ], [] for _ in range(num_steps): Y, dec_state = self.decoder(outputs[-1], dec_state, training=False) outputs.append(d2l.argmax(Y, 2)) # Save attention weights (to be covered later) if save_attention_weights: attention_weights.append(self.decoder.attention_weights) return d2l.concat(outputs[1:], 1), attention_weights class Seq2SeqEncoder(d2l.Encoder): """The RNN encoder for sequence-to-sequence learning. Defined in :numref:`sec_seq2seq`""" def __init__(self, vocab_size, embed_size, num_hiddens, num_layers, dropout=0): super().__init__() self.embedding = tf.keras.layers.Embedding(vocab_size, embed_size) self.rnn = d2l.GRU(num_hiddens, num_layers, dropout) def call(self, X, *args): # X shape: (batch_size, num_steps) embs = self.embedding(d2l.transpose(X)) # embs shape: (num_steps, batch_size, embed_size) outputs, state = self.rnn(embs) # outputs shape: (num_steps, batch_size, num_hiddens) # state shape: (num_layers, batch_size, num_hiddens) return outputs, state class Seq2Seq(d2l.EncoderDecoder): """The RNN encoder--decoder for sequence to sequence learning. Defined in :numref:`sec_seq2seq_decoder`""" def __init__(self, encoder, decoder, tgt_pad, lr): super().__init__(encoder, decoder) self.save_hyperparameters() def validation_step(self, batch): Y_hat = self(*batch[:-1]) self.plot('loss', self.loss(Y_hat, batch[-1]), train=False) def configure_optimizers(self): # Adam optimizer is used here return tf.keras.optimizers.Adam(learning_rate=self.lr) def bleu(pred_seq, label_seq, k): """Compute the BLEU. 
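# --- Usage sketch (editorial illustration, not generated by d2lbook) ---
# Shape check for the RNN sequence-to-sequence encoder; outputs are time-major
# and the final state has one entry per GRU layer. Sizes are illustrative.
def _demo_seq2seq_encoder_shapes():
    vocab_size, embed_size, num_hiddens, num_layers = 10, 8, 16, 2
    batch_size, num_steps = 4, 9
    encoder = Seq2SeqEncoder(vocab_size, embed_size, num_hiddens, num_layers)
    outputs, state = encoder(tf.zeros((batch_size, num_steps)), training=False)
    check_shape(outputs, (num_steps, batch_size, num_hiddens))
    check_len(state, num_layers)
    check_shape(state[0], (batch_size, num_hiddens))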
Defined in :numref:`sec_seq2seq_training`""" pred_tokens, label_tokens = pred_seq.split(' '), label_seq.split(' ') len_pred, len_label = len(pred_tokens), len(label_tokens) score = math.exp(min(0, 1 - len_label / len_pred)) for n in range(1, min(k, len_pred) + 1): num_matches, label_subs = 0, collections.defaultdict(int) for i in range(len_label - n + 1): label_subs[' '.join(label_tokens[i: i + n])] += 1 for i in range(len_pred - n + 1): if label_subs[' '.join(pred_tokens[i: i + n])] > 0: num_matches += 1 label_subs[' '.join(pred_tokens[i: i + n])] -= 1 score *= math.pow(num_matches / (len_pred - n + 1), math.pow(0.5, n)) return score def show_heatmaps(matrices, xlabel, ylabel, titles=None, figsize=(2.5, 2.5), cmap='Reds'): """Show heatmaps of matrices. Defined in :numref:`sec_queries-keys-values`""" d2l.use_svg_display() num_rows, num_cols, _, _ = matrices.shape fig, axes = d2l.plt.subplots(num_rows, num_cols, figsize=figsize, sharex=True, sharey=True, squeeze=False) for i, (row_axes, row_matrices) in enumerate(zip(axes, matrices)): for j, (ax, matrix) in enumerate(zip(row_axes, row_matrices)): pcm = ax.imshow(d2l.numpy(matrix), cmap=cmap) if i == num_rows - 1: ax.set_xlabel(xlabel) if j == 0: ax.set_ylabel(ylabel) if titles: ax.set_title(titles[j]) fig.colorbar(pcm, ax=axes, shrink=0.6); def masked_softmax(X, valid_lens): """Perform softmax operation by masking elements on the last axis. Defined in :numref:`sec_attention-scoring-functions`""" # X: 3D tensor, valid_lens: 1D or 2D tensor def _sequence_mask(X, valid_len, value=0): maxlen = X.shape[1] mask = tf.range(start=0, limit=maxlen, dtype=tf.float32)[ None, :] < tf.cast(valid_len[:, None], dtype=tf.float32) if len(X.shape) == 3: return tf.where(tf.expand_dims(mask, axis=-1), X, value) else: return tf.where(mask, X, value) if valid_lens is None: return tf.nn.softmax(X, axis=-1) else: shape = X.shape if len(valid_lens.shape) == 1: valid_lens = tf.repeat(valid_lens, repeats=shape[1]) else: valid_lens = tf.reshape(valid_lens, shape=-1) # On the last axis, replace masked elements with a very large negative # value, whose exponentiation outputs 0 X = _sequence_mask(tf.reshape(X, shape=(-1, shape[-1])), valid_lens, value=-1e6) return tf.nn.softmax(tf.reshape(X, shape=shape), axis=-1) class DotProductAttention(tf.keras.layers.Layer): """Scaled dot product attention. Defined in :numref:`subsec_batch_dot`""" def __init__(self, dropout): super().__init__() self.dropout = tf.keras.layers.Dropout(dropout) # Shape of queries: (batch_size, no. of queries, d) # Shape of keys: (batch_size, no. of key-value pairs, d) # Shape of values: (batch_size, no. of key-value pairs, value dimension) # Shape of valid_lens: (batch_size,) or (batch_size, no. of queries) def call(self, queries, keys, values, valid_lens=None, **kwargs): d = queries.shape[-1] scores = tf.matmul(queries, keys, transpose_b=True)/tf.math.sqrt( tf.cast(d, dtype=tf.float32)) self.attention_weights = masked_softmax(scores, valid_lens) return tf.matmul(self.dropout(self.attention_weights, **kwargs), values) class AdditiveAttention(tf.keras.layers.Layer): """Additive attention. 
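# --- Usage sketch (editorial illustration, not generated by d2lbook) ---
# Scaled dot product attention with per-example valid lengths; each example
# has a single query attending over ten key-value pairs. Sizes are illustrative.
def _demo_dot_product_attention():
    queries = tf.random.normal(shape=(2, 1, 2))
    keys = tf.random.normal(shape=(2, 10, 2))
    values = tf.random.normal(shape=(2, 10, 4))
    valid_lens = tf.constant([2, 6])
    attention = DotProductAttention(dropout=0.5)
    out = attention(queries, keys, values, valid_lens, training=False)
    check_shape(out, (2, 1, 4))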
Defined in :numref:`subsec_batch_dot`""" def __init__(self, key_size, query_size, num_hiddens, dropout, **kwargs): super().__init__(**kwargs) self.W_k = tf.keras.layers.Dense(num_hiddens, use_bias=False) self.W_q = tf.keras.layers.Dense(num_hiddens, use_bias=False) self.w_v = tf.keras.layers.Dense(1, use_bias=False) self.dropout = tf.keras.layers.Dropout(dropout) def call(self, queries, keys, values, valid_lens, **kwargs): queries, keys = self.W_q(queries), self.W_k(keys) # After dimension expansion, shape of queries: (batch_size, no. of # queries, 1, num_hiddens) and shape of keys: (batch_size, 1, no. of # key-value pairs, num_hiddens). Sum them up with broadcasting features = tf.expand_dims(queries, axis=2) + tf.expand_dims( keys, axis=1) features = tf.nn.tanh(features) # There is only one output of self.w_v, so we remove the last # one-dimensional entry from the shape. Shape of scores: (batch_size, # no. of queries, no. of key-value pairs) scores = tf.squeeze(self.w_v(features), axis=-1) self.attention_weights = masked_softmax(scores, valid_lens) # Shape of values: (batch_size, no. of key-value pairs, value # dimension) return tf.matmul(self.dropout( self.attention_weights, **kwargs), values) class AttentionDecoder(d2l.Decoder): """The base attention-based decoder interface. Defined in :numref:`sec_seq2seq_attention`""" def __init__(self): super().__init__() @property def attention_weights(self): raise NotImplementedError class MultiHeadAttention(d2l.Module): """Multi-head attention. Defined in :numref:`sec_multihead-attention`""" def __init__(self, key_size, query_size, value_size, num_hiddens, num_heads, dropout, bias=False, **kwargs): super().__init__() self.num_heads = num_heads self.attention = d2l.DotProductAttention(dropout) self.W_q = tf.keras.layers.Dense(num_hiddens, use_bias=bias) self.W_k = tf.keras.layers.Dense(num_hiddens, use_bias=bias) self.W_v = tf.keras.layers.Dense(num_hiddens, use_bias=bias) self.W_o = tf.keras.layers.Dense(num_hiddens, use_bias=bias) def call(self, queries, keys, values, valid_lens, **kwargs): # Shape of queries, keys, or values: # (batch_size, no. of queries or key-value pairs, num_hiddens) # Shape of valid_lens: (batch_size,) or (batch_size, no. of queries) # After transposing, shape of output queries, keys, or values: # (batch_size * num_heads, no. of queries or key-value pairs, # num_hiddens / num_heads) queries = self.transpose_qkv(self.W_q(queries)) keys = self.transpose_qkv(self.W_k(keys)) values = self.transpose_qkv(self.W_v(values)) if valid_lens is not None: # On axis 0, copy the first item (scalar or vector) for num_heads # times, then copy the next item, and so on valid_lens = tf.repeat(valid_lens, repeats=self.num_heads, axis=0) # Shape of output: (batch_size * num_heads, no. of queries, # num_hiddens / num_heads) output = self.attention(queries, keys, values, valid_lens, **kwargs) # Shape of output_concat: (batch_size, no. of queries, num_hiddens) output_concat = self.transpose_output(output) return self.W_o(output_concat) def transpose_qkv(self, X): """Transposition for parallel computation of multiple attention heads. Defined in :numref:`sec_multihead-attention`""" # Shape of input X: (batch_size, no. of queries or key-value pairs, # num_hiddens). Shape of output X: (batch_size, no. of queries or # key-value pairs, num_heads, num_hiddens / num_heads) X = tf.reshape(X, shape=(X.shape[0], X.shape[1], self.num_heads, -1)) # Shape of output X: (batch_size, num_heads, no. 
of queries or key-value # pairs, num_hiddens / num_heads) X = tf.transpose(X, perm=(0, 2, 1, 3)) # Shape of output: (batch_size * num_heads, no. of queries or key-value # pairs, num_hiddens / num_heads) return tf.reshape(X, shape=(-1, X.shape[2], X.shape[3])) def transpose_output(self, X): """Reverse the operation of transpose_qkv. Defined in :numref:`sec_multihead-attention`""" X = tf.reshape(X, shape=(-1, self.num_heads, X.shape[1], X.shape[2])) X = tf.transpose(X, perm=(0, 2, 1, 3)) return tf.reshape(X, shape=(X.shape[0], X.shape[1], -1)) class PositionalEncoding(tf.keras.layers.Layer): """Positional encoding. Defined in :numref:`sec_self-attention-and-positional-encoding`""" def __init__(self, num_hiddens, dropout, max_len=1000): super().__init__() self.dropout = tf.keras.layers.Dropout(dropout) # Create a long enough P self.P = np.zeros((1, max_len, num_hiddens)) X = np.arange(max_len, dtype=np.float32).reshape( -1,1)/np.power(10000, np.arange( 0, num_hiddens, 2, dtype=np.float32) / num_hiddens) self.P[:, :, 0::2] = np.sin(X) self.P[:, :, 1::2] = np.cos(X) def call(self, X, **kwargs): X = X + self.P[:, :X.shape[1], :] return self.dropout(X, **kwargs) class PositionWiseFFN(tf.keras.layers.Layer): """The positionwise feed-forward network. Defined in :numref:`sec_transformer`""" def __init__(self, ffn_num_hiddens, ffn_num_outputs): super().__init__() self.dense1 = tf.keras.layers.Dense(ffn_num_hiddens) self.relu = tf.keras.layers.ReLU() self.dense2 = tf.keras.layers.Dense(ffn_num_outputs) def call(self, X): return self.dense2(self.relu(self.dense1(X))) class AddNorm(tf.keras.layers.Layer): """The residual connection followed by layer normalization. Defined in :numref:`subsec_positionwise-ffn`""" def __init__(self, norm_shape, dropout): super().__init__() self.dropout = tf.keras.layers.Dropout(dropout) self.ln = tf.keras.layers.LayerNormalization(norm_shape) def call(self, X, Y, **kwargs): return self.ln(self.dropout(Y, **kwargs) + X) class TransformerEncoderBlock(tf.keras.layers.Layer): """The Transformer encoder block. Defined in :numref:`subsec_positionwise-ffn`""" def __init__(self, key_size, query_size, value_size, num_hiddens, norm_shape, ffn_num_hiddens, num_heads, dropout, bias=False): super().__init__() self.attention = d2l.MultiHeadAttention( key_size, query_size, value_size, num_hiddens, num_heads, dropout, bias) self.addnorm1 = AddNorm(norm_shape, dropout) self.ffn = PositionWiseFFN(ffn_num_hiddens, num_hiddens) self.addnorm2 = AddNorm(norm_shape, dropout) def call(self, X, valid_lens, **kwargs): Y = self.addnorm1(X, self.attention(X, X, X, valid_lens, **kwargs), **kwargs) return self.addnorm2(Y, self.ffn(Y), **kwargs) class TransformerEncoder(d2l.Encoder): """The Transformer encoder. 
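# --- Usage sketch (editorial illustration, not generated by d2lbook) ---
# A Transformer encoder block preserves the input shape; the hyperparameters
# below are illustrative and the helper is not called on import.
def _demo_transformer_encoder_block_shapes():
    X = tf.random.normal((2, 100, 24))
    valid_lens = tf.constant([3, 2])
    encoder_blk = TransformerEncoderBlock(24, 24, 24, 24, [1, 2], 48, 8, 0.5)
    check_shape(encoder_blk(X, valid_lens, training=False), X.shape)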
Defined in :numref:`subsec_transformer-encoder`""" def __init__(self, vocab_size, key_size, query_size, value_size, num_hiddens, norm_shape, ffn_num_hiddens, num_heads, num_blks, dropout, bias=False): super().__init__() self.num_hiddens = num_hiddens self.embedding = tf.keras.layers.Embedding(vocab_size, num_hiddens) self.pos_encoding = d2l.PositionalEncoding(num_hiddens, dropout) self.blks = [TransformerEncoderBlock( key_size, query_size, value_size, num_hiddens, norm_shape, ffn_num_hiddens, num_heads, dropout, bias) for _ in range( num_blks)] def call(self, X, valid_lens, **kwargs): # Since positional encoding values are between -1 and 1, the embedding # values are multiplied by the square root of the embedding dimension # to rescale before they are summed up X = self.pos_encoding(self.embedding(X) * tf.math.sqrt( tf.cast(self.num_hiddens, dtype=tf.float32)), **kwargs) self.attention_weights = [None] * len(self.blks) for i, blk in enumerate(self.blks): X = blk(X, valid_lens, **kwargs) self.attention_weights[ i] = blk.attention.attention.attention_weights return X def annotate(text, xy, xytext): """Defined in :numref:`sec_optimization-intro`""" d2l.plt.gca().annotate(text, xy=xy, xytext=xytext, arrowprops=dict(arrowstyle='->')) def train_2d(trainer, steps=20, f_grad=None): """Optimize a 2D objective function with a customized trainer. Defined in :numref:`subsec_gd-learningrate`""" # `s1` and `s2` are internal state variables that will be used in Momentum, adagrad, RMSProp x1, x2, s1, s2 = -5, -2, 0, 0 results = [(x1, x2)] for i in range(steps): if f_grad: x1, x2, s1, s2 = trainer(x1, x2, s1, s2, f_grad) else: x1, x2, s1, s2 = trainer(x1, x2, s1, s2) results.append((x1, x2)) print(f'epoch {i + 1}, x1: {float(x1):f}, x2: {float(x2):f}') return results def show_trace_2d(f, results): """Show the trace of 2D variables during optimization. 
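# --- Usage sketch (editorial illustration, not generated by d2lbook) ---
# `train_2d` with plain gradient descent on f(x1, x2) = x1**2 + 2*x2**2; the
# objective, step size and step count are illustrative only.
def _demo_train_2d():
    f = lambda x1, x2: x1 ** 2 + 2 * x2 ** 2
    f_grad = lambda x1, x2: (2 * x1, 4 * x2)

    def gd(x1, x2, s1, s2, f_grad):
        g1, g2 = f_grad(x1, x2)
        eta = 0.1  # learning rate
        return x1 - eta * g1, x2 - eta * g2, 0, 0

    results = train_2d(gd, steps=20, f_grad=f_grad)
    show_trace_2d(f, results)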
Defined in :numref:`subsec_gd-learningrate`""" d2l.set_figsize() d2l.plt.plot(*zip(*results), '-o', color='#ff7f0e') x1, x2 = d2l.meshgrid(d2l.arange(-5.5, 1.0, 0.1), d2l.arange(-3.0, 1.0, 0.1)) d2l.plt.contour(x1, x2, f(x1, x2), colors='#1f77b4') d2l.plt.xlabel('x1') d2l.plt.ylabel('x2') class Timer: """Record multiple running times.""" def __init__(self): """Defined in :numref:`sec_minibatch_sgd`""" self.times = [] self.start() def start(self): """Start the timer.""" self.tik = time.time() def stop(self): """Stop the timer and record the time in a list.""" self.times.append(time.time() - self.tik) return self.times[-1] def avg(self): """Return the average time.""" return sum(self.times) / len(self.times) def sum(self): """Return the sum of time.""" return sum(self.times) def cumsum(self): """Return the accumulated time.""" return np.array(self.times).cumsum().tolist() d2l.DATA_HUB['airfoil'] = (d2l.DATA_URL + 'airfoil_self_noise.dat', '76e5be1548fd8222e5074cf0faae75edff8cf93f') def get_data_ch11(batch_size=10, n=1500): """Defined in :numref:`sec_minibatches`""" data = np.genfromtxt(d2l.download('airfoil'), dtype=np.float32, delimiter='\t') data = (data - data.mean(axis=0)) / data.std(axis=0) data_iter = d2l.load_array((data[:n, :-1], data[:n, -1]), batch_size, is_train=True) return data_iter, data.shape[1]-1 def train_ch11(trainer_fn, states, hyperparams, data_iter, feature_dim, num_epochs=2): """Defined in :numref:`sec_minibatches`""" # Initialization w = tf.Variable(tf.random.normal(shape=(feature_dim, 1), mean=0, stddev=0.01),trainable=True) b = tf.Variable(tf.zeros(1), trainable=True) # Train net, loss = lambda X: d2l.linreg(X, w, b), d2l.squared_loss animator = d2l.Animator(xlabel='epoch', ylabel='loss', xlim=[0, num_epochs], ylim=[0.22, 0.35]) n, timer = 0, d2l.Timer() for _ in range(num_epochs): for X, y in data_iter: with tf.GradientTape() as g: l = tf.math.reduce_mean(loss(net(X), y)) dw, db = g.gradient(l, [w, b]) trainer_fn([w, b], [dw, db], states, hyperparams) n += X.shape[0] if n % 200 == 0: timer.stop() p = n/X.shape[0] q = p/tf.data.experimental.cardinality(data_iter).numpy() r = (d2l.evaluate_loss(net, data_iter, loss),) animator.add(q, r) timer.start() print(f'loss: {animator.Y[0][-1]:.3f}, {timer.sum()/num_epochs:.3f} sec/epoch') return timer.cumsum(), animator.Y[0] def train_concise_ch11(trainer_fn, hyperparams, data_iter, num_epochs=2): """Defined in :numref:`sec_minibatches`""" # Initialization net = tf.keras.Sequential() net.add(tf.keras.layers.Dense(1, kernel_initializer=tf.random_normal_initializer(stddev=0.01))) optimizer = trainer_fn(**hyperparams) loss = tf.keras.losses.MeanSquaredError() animator = d2l.Animator(xlabel='epoch', ylabel='loss', xlim=[0, num_epochs], ylim=[0.22, 0.35]) n, timer = 0, d2l.Timer() for _ in range(num_epochs): for X, y in data_iter: with tf.GradientTape() as g: out = net(X) l = loss(y, out) params = net.trainable_variables grads = g.gradient(l, params) optimizer.apply_gradients(zip(grads, params)) n += X.shape[0] if n % 200 == 0: timer.stop() p = n/X.shape[0] q = p/tf.data.experimental.cardinality(data_iter).numpy() # `MeanSquaredError` computes squared error without the 1/2 # factor r = (d2l.evaluate_loss(net, data_iter, loss) / 2,) animator.add(q, r) timer.start() print(f'loss: {animator.Y[0][-1]:.3f}, {timer.sum()/num_epochs:.3f} sec/epoch') class Benchmark: """For measuring running time.""" def __init__(self, description='Done'): """Defined in :numref:`sec_hybridize`""" self.description = description def __enter__(self): 
self.timer = d2l.Timer() return self def __exit__(self, *args): print(f'{self.description}: {self.timer.stop():.4f} sec') def box_corner_to_center(boxes): """Convert from (upper-left, lower-right) to (center, width, height). Defined in :numref:`sec_bbox`""" x1, y1, x2, y2 = boxes[:, 0], boxes[:, 1], boxes[:, 2], boxes[:, 3] cx = (x1 + x2) / 2 cy = (y1 + y2) / 2 w = x2 - x1 h = y2 - y1 boxes = d2l.stack((cx, cy, w, h), axis=-1) return boxes def box_center_to_corner(boxes): """Convert from (center, width, height) to (upper-left, lower-right). Defined in :numref:`sec_bbox`""" cx, cy, w, h = boxes[:, 0], boxes[:, 1], boxes[:, 2], boxes[:, 3] x1 = cx - 0.5 * w y1 = cy - 0.5 * h x2 = cx + 0.5 * w y2 = cy + 0.5 * h boxes = d2l.stack((x1, y1, x2, y2), axis=-1) return boxes def bbox_to_rect(bbox, color): """Convert bounding box to matplotlib format. Defined in :numref:`sec_bbox`""" # Convert the bounding box (upper-left x, upper-left y, lower-right x, # lower-right y) format to the matplotlib format: ((upper-left x, # upper-left y), width, height) return d2l.plt.Rectangle( xy=(bbox[0], bbox[1]), width=bbox[2]-bbox[0], height=bbox[3]-bbox[1], fill=False, edgecolor=color, linewidth=2) def update_D(X, Z, net_D, net_G, loss, optimizer_D): """Update discriminator. Defined in :numref:`sec_basic_gan`""" batch_size = X.shape[0] ones = tf.ones((batch_size,)) # Labels corresponding to real data zeros = tf.zeros((batch_size,)) # Labels corresponding to fake data # Do not need to compute gradient for `net_G`, so it is outside GradientTape fake_X = net_G(Z) with tf.GradientTape() as tape: real_Y = net_D(X) fake_Y = net_D(fake_X) # We multiply the loss by batch_size to match PyTorch's BCEWithLogitsLoss loss_D = (loss(ones, tf.squeeze(real_Y)) + loss( zeros, tf.squeeze(fake_Y))) * batch_size / 2 grads_D = tape.gradient(loss_D, net_D.trainable_variables) optimizer_D.apply_gradients(zip(grads_D, net_D.trainable_variables)) return loss_D def update_G(Z, net_D, net_G, loss, optimizer_G): """Update generator. Defined in :numref:`sec_basic_gan`""" batch_size = Z.shape[0] ones = tf.ones((batch_size,)) with tf.GradientTape() as tape: # We could reuse `fake_X` from `update_D` to save computation fake_X = net_G(Z) # Recomputing `fake_Y` is needed since `net_D` is changed fake_Y = net_D(fake_X) # We multiply the loss by batch_size to match PyTorch's BCEWithLogits loss loss_G = loss(ones, tf.squeeze(fake_Y)) * batch_size grads_G = tape.gradient(loss_G, net_G.trainable_variables) optimizer_G.apply_gradients(zip(grads_G, net_G.trainable_variables)) return loss_G d2l.DATA_HUB['pokemon'] = (d2l.DATA_URL + 'pokemon.zip', 'c065c0e2593b8b161a2d7873e42418bf6a21106c') def load_array(data_arrays, batch_size, is_train=True): """Construct a TensorFlow data iterator. Defined in :numref:`sec_utils`""" dataset = tf.data.Dataset.from_tensor_slices(data_arrays) if is_train: dataset = dataset.shuffle(buffer_size=1000) dataset = dataset.batch(batch_size) return dataset def synthetic_data(w, b, num_examples): """Generate y = Xw + b + noise. Defined in :numref:`sec_utils`""" X = tf.zeros((num_examples, w.shape[0])) X += tf.random.normal(shape=X.shape) y = tf.matmul(X, tf.reshape(w, (-1, 1))) + b y += tf.random.normal(shape=y.shape, stddev=0.01) y = tf.reshape(y, (-1, 1)) return X, y def sgd(params, grads, lr, batch_size): """Minibatch stochastic gradient descent. 
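# --- Usage sketch (editorial illustration, not generated by d2lbook) ---
# The two bounding-box conversions above are inverses of each other; the
# coordinates below are illustrative only.
def _demo_bounding_box_conversions():
    boxes = tf.constant([[60.0, 45.0, 378.0, 516.0],
                         [400.0, 112.0, 655.0, 493.0]])
    roundtrip = box_center_to_corner(box_corner_to_center(boxes))
    assert bool(tf.reduce_all(tf.abs(roundtrip - boxes) < 1e-4))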
Defined in :numref:`sec_utils`""" for param, grad in zip(params, grads): param.assign_sub(lr * grad / batch_size) def load_data_fashion_mnist(batch_size, resize=None): """Download the Fashion-MNIST dataset and then load it into memory. Defined in :numref:`sec_utils`""" mnist_train, mnist_test = tf.keras.datasets.fashion_mnist.load_data() # Divide all numbers by 255 so that all pixel values are between # 0 and 1, add a batch dimension at the last. And cast label to int32 process = lambda X, y: (tf.expand_dims(X, axis=3) / 255, tf.cast(y, dtype='int32')) resize_fn = lambda X, y: ( tf.image.resize_with_pad(X, resize, resize) if resize else X, y) return ( tf.data.Dataset.from_tensor_slices(process(*mnist_train)).batch( batch_size).shuffle(len(mnist_train[0])).map(resize_fn), tf.data.Dataset.from_tensor_slices(process(*mnist_test)).batch( batch_size).map(resize_fn)) class TrainCallback(tf.keras.callbacks.Callback): """A callback to visiualize the training progress. Defined in :numref:`sec_utils`""" def __init__(self, net, train_iter, test_iter, num_epochs, device_name): self.timer = d2l.Timer() self.animator = d2l.Animator( xlabel='epoch', xlim=[1, num_epochs], legend=[ 'train loss', 'train acc', 'test acc']) self.net = net self.train_iter = train_iter self.test_iter = test_iter self.num_epochs = num_epochs self.device_name = device_name def on_epoch_begin(self, epoch, logs=None): self.timer.start() def on_epoch_end(self, epoch, logs): self.timer.stop() test_acc = self.net.evaluate( self.test_iter, verbose=0, return_dict=True)['accuracy'] metrics = (logs['loss'], logs['accuracy'], test_acc) self.animator.add(epoch + 1, metrics) if epoch == self.num_epochs - 1: batch_size = next(iter(self.train_iter))[0].shape[0] num_examples = batch_size * tf.data.experimental.cardinality( self.train_iter).numpy() print(f'loss {metrics[0]:.3f}, train acc {metrics[1]:.3f}, ' f'test acc {metrics[2]:.3f}') print(f'{num_examples / self.timer.avg():.1f} examples/sec on ' f'{str(self.device_name)}') def train_ch6(net_fn, train_iter, test_iter, num_epochs, lr, device): """Train a model with a GPU (defined in Chapter 6). Defined in :numref:`sec_utils`""" device_name = device._device_name strategy = tf.distribute.OneDeviceStrategy(device_name) with strategy.scope(): optimizer = tf.keras.optimizers.SGD(learning_rate=lr) loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) net = net_fn() net.compile(optimizer=optimizer, loss=loss, metrics=['accuracy']) callback = TrainCallback(net, train_iter, test_iter, num_epochs, device_name) net.fit(train_iter, epochs=num_epochs, verbose=0, callbacks=[callback]) return net def evaluate_accuracy(net, data_iter): """Compute the accuracy for a model on a dataset. Defined in :numref:`sec_utils`""" metric = Accumulator(2) # No. of correct predictions, no. of predictions for X, y in data_iter: metric.add(accuracy(net(X), y), d2l.size(y)) return metric[0] / metric[1] def show_images(imgs, num_rows, num_cols, titles=None, scale=1.5): """Plot a list of images. Defined in :numref:`sec_utils`""" figsize = (num_cols * scale, num_rows * scale) _, axes = d2l.plt.subplots(num_rows, num_cols, figsize=figsize) axes = axes.flatten() for i, (ax, img) in enumerate(zip(axes, imgs)): try: img = d2l.numpy(img) except: pass ax.imshow(img) ax.axes.get_xaxis().set_visible(False) ax.axes.get_yaxis().set_visible(False) if titles: ax.set_title(titles[i]) return axes def linreg(X, w, b): """The linear regression model. 
Defined in :numref:`sec_utils`""" return d2l.matmul(X, w) + b def squared_loss(y_hat, y): """Squared loss. Defined in :numref:`sec_utils`""" return (y_hat - d2l.reshape(y, y_hat.shape)) ** 2 / 2 def get_fashion_mnist_labels(labels): """Return text labels for the Fashion-MNIST dataset. Defined in :numref:`sec_utils`""" text_labels = ['t-shirt', 'trouser', 'pullover', 'dress', 'coat', 'sandal', 'shirt', 'sneaker', 'bag', 'ankle boot'] return [text_labels[int(i)] for i in labels] class Animator: """For plotting data in animation.""" def __init__(self, xlabel=None, ylabel=None, legend=None, xlim=None, ylim=None, xscale='linear', yscale='linear', fmts=('-', 'm--', 'g-.', 'r:'), nrows=1, ncols=1, figsize=(3.5, 2.5)): """Defined in :numref:`sec_utils`""" # Incrementally plot multiple lines if legend is None: legend = [] d2l.use_svg_display() self.fig, self.axes = d2l.plt.subplots(nrows, ncols, figsize=figsize) if nrows * ncols == 1: self.axes = [self.axes, ] # Use a lambda function to capture arguments self.config_axes = lambda: d2l.set_axes( self.axes[0], xlabel, ylabel, xlim, ylim, xscale, yscale, legend) self.X, self.Y, self.fmts = None, None, fmts def add(self, x, y): # Add multiple data points into the figure if not hasattr(y, "__len__"): y = [y] n = len(y) if not hasattr(x, "__len__"): x = [x] * n if not self.X: self.X = [[] for _ in range(n)] if not self.Y: self.Y = [[] for _ in range(n)] for i, (a, b) in enumerate(zip(x, y)): if a is not None and b is not None: self.X[i].append(a) self.Y[i].append(b) self.axes[0].cla() for x, y, fmt in zip(self.X, self.Y, self.fmts): self.axes[0].plot(x, y, fmt) self.config_axes() display.display(self.fig) display.clear_output(wait=True) class Accumulator: """For accumulating sums over `n` variables.""" def __init__(self, n): """Defined in :numref:`sec_utils`""" self.data = [0.0] * n def add(self, *args): self.data = [a + float(b) for a, b in zip(self.data, args)] def reset(self): self.data = [0.0] * len(self.data) def __getitem__(self, idx): return self.data[idx] def accuracy(y_hat, y): """Compute the number of correct predictions. Defined in :numref:`sec_utils`""" if len(y_hat.shape) > 1 and y_hat.shape[1] > 1: y_hat = d2l.argmax(y_hat, axis=1) cmp = d2l.astype(y_hat, y.dtype) == y return float(d2l.reduce_sum(d2l.astype(cmp, y.dtype))) def download(url, folder='../data', sha1_hash=None): """Download a file to folder and return the local filepath. Defined in :numref:`sec_utils`""" if not url.startswith('http'): # For back compatability url, sha1_hash = DATA_HUB[url] os.makedirs(folder, exist_ok=True) fname = os.path.join(folder, url.split('/')[-1]) # Check if hit cache if os.path.exists(fname) and sha1_hash: sha1 = hashlib.sha1() with open(fname, 'rb') as f: while True: data = f.read(1048576) if not data: break sha1.update(data) if sha1.hexdigest() == sha1_hash: return fname # Download print(f'Downloading {fname} from {url}...') r = requests.get(url, stream=True, verify=True) with open(fname, 'wb') as f: f.write(r.content) return fname def extract(filename, folder=None): """Extract a zip/tar file into folder. Defined in :numref:`sec_utils`""" base_dir = os.path.dirname(filename) _, ext = os.path.splitext(filename) assert ext in ('.zip', '.tar', '.gz'), 'Only support zip/tar files.' if ext == '.zip': fp = zipfile.ZipFile(filename, 'r') else: fp = tarfile.open(filename, 'r') if folder is None: folder = base_dir fp.extractall(folder) def download_extract(name, folder=None): """Download and extract a zip/tar file. 
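# --- Usage sketch (editorial illustration, not generated by d2lbook) ---
# `Accumulator` tallies correct predictions and example counts so `accuracy`
# can be averaged over minibatches; the tensors below are illustrative.
def _demo_accuracy_and_accumulator():
    y_hat = tf.constant([[0.1, 0.3, 0.6], [0.3, 0.2, 0.5]])
    y = tf.constant([0, 2])
    metric = Accumulator(2)  # no. of correct predictions, no. of predictions
    metric.add(accuracy(y_hat, y), d2l.size(y))
    assert metric[0] / metric[1] == 0.5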
Defined in :numref:`sec_utils`""" fname = download(name) base_dir = os.path.dirname(fname) data_dir, ext = os.path.splitext(fname) if ext == '.zip': fp = zipfile.ZipFile(fname, 'r') elif ext in ('.tar', '.gz'): fp = tarfile.open(fname, 'r') else: assert False, 'Only zip/tar files can be extracted.' fp.extractall(base_dir) return os.path.join(base_dir, folder) if folder else data_dir def tokenize(lines, token='word'): """Split text lines into word or character tokens. Defined in :numref:`sec_utils`""" assert token in ('word', 'char'), 'Unknown token type: ' + token return [line.split() if token == 'word' else list(line) for line in lines] def evaluate_loss(net, data_iter, loss): """Evaluate the loss of a model on the given dataset. Defined in :numref:`sec_utils`""" metric = d2l.Accumulator(2) # Sum of losses, no. of examples for X, y in data_iter: l = loss(net(X), y) metric.add(d2l.reduce_sum(l), d2l.size(l)) return metric[0] / metric[1] def grad_clipping(grads, theta): """Clip the gradient. Defined in :numref:`sec_utils`""" theta = tf.constant(theta, dtype=tf.float32) new_grad = [] for grad in grads: if isinstance(grad, tf.IndexedSlices): new_grad.append(tf.convert_to_tensor(grad)) else: new_grad.append(grad) norm = tf.math.sqrt(sum((tf.reduce_sum(grad ** 2)).numpy() for grad in new_grad)) norm = tf.cast(norm, tf.float32) if tf.greater(norm, theta): for i, grad in enumerate(new_grad): new_grad[i] = grad * theta / norm else: new_grad = new_grad return new_grad d2l.DATA_HUB['fra-eng'] = (d2l.DATA_URL + 'fra-eng.zip', '94646ad1522d915e7b0f9296181140edcf86a4f5') def read_data_nmt(): """Load the English-French dataset. Defined in :numref:`sec_utils`""" data_dir = d2l.download_extract('fra-eng') with open(os.path.join(data_dir, 'fra.txt'), 'r', encoding='utf-8') as f: return f.read() def preprocess_nmt(text): """Preprocess the English-French dataset. Defined in :numref:`sec_utils`""" def no_space(char, prev_char): return char in set(',.!?') and prev_char != ' ' # Replace non-breaking space with space, and convert uppercase letters to # lowercase ones text = text.replace('\u202f', ' ').replace('\xa0', ' ').lower() # Insert space between words and punctuation marks out = [' ' + char if i > 0 and no_space(char, text[i - 1]) else char for i, char in enumerate(text)] return ''.join(out) def tokenize_nmt(text, num_examples=None): """Tokenize the English-French dataset. Defined in :numref:`sec_utils`""" source, target = [], [] for i, line in enumerate(text.split('\n')): if num_examples and i > num_examples: break parts = line.split('\t') if len(parts) == 2: source.append(parts[0].split(' ')) target.append(parts[1].split(' ')) return source, target def truncate_pad(line, num_steps, padding_token): """Truncate or pad sequences. Defined in :numref:`sec_utils`""" if len(line) > num_steps: return line[:num_steps] # Truncate return line + [padding_token] * (num_steps - len(line)) # Pad def build_array_nmt(lines, vocab, num_steps): """Transform text sequences of machine translation into minibatches. Defined in :numref:`sec_utils`""" lines = [vocab[l] for l in lines] lines = [l + [vocab['']] for l in lines] array = d2l.tensor([truncate_pad( l, num_steps, vocab['']) for l in lines]) valid_len = d2l.reduce_sum( d2l.astype(array != vocab[''], d2l.int32), 1) return array, valid_len def load_data_nmt(batch_size, num_steps, num_examples=600): """Return the iterator and the vocabularies of the translation dataset. 
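# --- Usage sketch (editorial illustration, not generated by d2lbook) ---
# `tokenize` splits lines into word tokens and `truncate_pad` forces a fixed
# sequence length; the inputs below are illustrative only.
def _demo_tokenize_and_pad():
    assert tokenize(['machine translation', 'is fun']) == [
        ['machine', 'translation'], ['is', 'fun']]
    assert truncate_pad([1, 2, 3], 5, 0) == [1, 2, 3, 0, 0]
    assert truncate_pad([1, 2, 3, 4, 5, 6], 5, 0) == [1, 2, 3, 4, 5]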
def load_data_nmt(batch_size, num_steps, num_examples=600):
    """Return the iterator and the vocabularies of the translation dataset.

    Defined in :numref:`sec_utils`"""
    text = preprocess_nmt(read_data_nmt())
    source, target = tokenize_nmt(text, num_examples)
    src_vocab = d2l.Vocab(source, min_freq=2,
                          reserved_tokens=['<pad>', '<bos>', '<eos>'])
    tgt_vocab = d2l.Vocab(target, min_freq=2,
                          reserved_tokens=['<pad>', '<bos>', '<eos>'])
    src_array, src_valid_len = build_array_nmt(source, src_vocab, num_steps)
    tgt_array, tgt_valid_len = build_array_nmt(target, tgt_vocab, num_steps)
    data_arrays = (src_array, src_valid_len, tgt_array, tgt_valid_len)
    data_iter = d2l.load_array(data_arrays, batch_size)
    return data_iter, src_vocab, tgt_vocab

def sequence_mask(X, valid_len, value=0):
    """Mask irrelevant entries in sequences.

    Defined in :numref:`sec_utils`"""
    maxlen = X.shape[1]
    mask = tf.range(start=0, limit=maxlen, dtype=tf.float32)[
        None, :] < tf.cast(valid_len[:, None], dtype=tf.float32)
    if len(X.shape) == 3:
        return tf.where(tf.expand_dims(mask, axis=-1), X, value)
    else:
        return tf.where(mask, X, value)

class MaskedSoftmaxCELoss(tf.keras.losses.Loss):
    """The softmax cross-entropy loss with masks.

    Defined in :numref:`sec_utils`"""
    def __init__(self, valid_len):
        super().__init__(reduction='none')
        self.valid_len = valid_len

    # `pred` shape: (`batch_size`, `num_steps`, `vocab_size`)
    # `label` shape: (`batch_size`, `num_steps`)
    # `valid_len` shape: (`batch_size`,)
    def call(self, label, pred):
        weights = tf.ones_like(label, dtype=tf.float32)
        weights = sequence_mask(weights, self.valid_len)
        label_one_hot = tf.one_hot(label, depth=pred.shape[-1])
        unweighted_loss = tf.keras.losses.CategoricalCrossentropy(
            from_logits=True, reduction='none')(label_one_hot, pred)
        weighted_loss = tf.reduce_mean((unweighted_loss * weights), axis=1)
        return weighted_loss

def train_seq2seq(net, data_iter, lr, num_epochs, tgt_vocab, device):
    """Train a model for sequence-to-sequence learning.

    Defined in :numref:`sec_utils`"""
    optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
    animator = d2l.Animator(xlabel="epoch", ylabel="loss",
                            xlim=[10, num_epochs])
    for epoch in range(num_epochs):
        timer = d2l.Timer()
        metric = d2l.Accumulator(2)  # Sum of training loss, no. of tokens
        for batch in data_iter:
            X, X_valid_len, Y, Y_valid_len = [x for x in batch]
            bos = tf.reshape(tf.constant([tgt_vocab['<bos>']] * Y.shape[0]),
                             shape=(-1, 1))
            dec_input = tf.concat([bos, Y[:, :-1]], 1)  # Teacher forcing
            with tf.GradientTape() as tape:
                Y_hat, _ = net(X, dec_input, X_valid_len, training=True)
                l = MaskedSoftmaxCELoss(Y_valid_len)(Y, Y_hat)
            gradients = tape.gradient(l, net.trainable_variables)
            gradients = d2l.grad_clipping(gradients, 1)
            optimizer.apply_gradients(zip(gradients, net.trainable_variables))
            num_tokens = tf.reduce_sum(Y_valid_len).numpy()
            metric.add(tf.reduce_sum(l), num_tokens)
        if (epoch + 1) % 10 == 0:
            animator.add(epoch + 1, (metric[0] / metric[1],))
    print(f'loss {metric[0] / metric[1]:.3f}, {metric[1] / timer.stop():.1f} '
          f'tokens/sec on {str(device._device_name)}')
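
# --- Illustrative usage sketch (not part of the generated library) ----------
# A minimal check of `sequence_mask` and `MaskedSoftmaxCELoss` on toy tensors.
# The shapes and values below are assumptions chosen only to show the masking
# behaviour: entries past `valid_len` are zeroed and excluded from the loss.
def _masking_demo():
    X = tf.constant([[1., 2., 3.], [4., 5., 6.]])
    valid_len = tf.constant([1, 2])
    masked = sequence_mask(X, valid_len)  # [[1., 0., 0.], [4., 5., 0.]]
    # Random logits for a batch of 2 sequences, 3 steps, vocabulary size 4
    pred = tf.random.normal((2, 3, 4))
    label = tf.zeros((2, 3), dtype=tf.int32)
    loss = MaskedSoftmaxCELoss(valid_len)(label, pred)  # shape: (2,)
    return masked, loss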
def predict_seq2seq(net, src_sentence, src_vocab, tgt_vocab, num_steps,
                    save_attention_weights=False):
    """Predict with a sequence-to-sequence model.

    Defined in :numref:`sec_utils`"""
    src_tokens = src_vocab[src_sentence.lower().split(' ')] + [
        src_vocab['<eos>']]
    enc_valid_len = tf.constant([len(src_tokens)])
    src_tokens = d2l.truncate_pad(src_tokens, num_steps, src_vocab['<pad>'])
    # Add the batch axis
    enc_X = tf.expand_dims(src_tokens, axis=0)
    enc_outputs = net.encoder(enc_X, enc_valid_len, training=False)
    dec_state = net.decoder.init_state(enc_outputs, enc_valid_len)
    # Add the batch axis
    dec_X = tf.expand_dims(tf.constant([tgt_vocab['<bos>']]), axis=0)
    output_seq, attention_weight_seq = [], []
    for _ in range(num_steps):
        Y, dec_state = net.decoder(dec_X, dec_state, training=False)
        # We use the token with the highest prediction likelihood as input
        # of the decoder at the next time step
        dec_X = tf.argmax(Y, axis=2)
        pred = tf.squeeze(dec_X, axis=0)
        # Save attention weights
        if save_attention_weights:
            attention_weight_seq.append(net.decoder.attention_weights)
        # Once the end-of-sequence token is predicted, the generation of the
        # output sequence is complete
        if pred == tgt_vocab['<eos>']:
            break
        output_seq.append(pred.numpy())
    return ' '.join(tgt_vocab.to_tokens(tf.reshape(
        output_seq, shape=-1).numpy().tolist())), attention_weight_seq

# Alias defined in config.ini
size = lambda a: tf.size(a).numpy()
reshape = tf.reshape
ones_like = tf.ones_like
ones = tf.ones
zeros_like = tf.zeros_like
zeros = tf.zeros
meshgrid = tf.meshgrid
sin = tf.sin
sinh = tf.sinh
cos = tf.cos
cosh = tf.cosh
tanh = tf.tanh
linspace = tf.linspace
exp = tf.exp
normal = tf.random.normal
rand = tf.random.uniform
matmul = tf.matmul
reduce_sum = tf.reduce_sum
reduce_mean = tf.reduce_mean
argmax = tf.argmax
tensor = tf.constant
arange = tf.range
astype = tf.cast
int32 = tf.int32
int64 = tf.int64
float32 = tf.float32
transpose = tf.transpose
concat = tf.concat
stack = tf.stack
abs = tf.abs
eye = tf.eye
log = tf.math.log
sigmoid = tf.sigmoid
expand_dims = tf.expand_dims
repeat = tf.repeat
batch_matmul = tf.matmul
numpy = lambda x, *args, **kwargs: x.numpy(*args, **kwargs)
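
# --- Illustrative usage sketch (not part of the generated library) ----------
# The aliases above expose TensorFlow operations under framework-agnostic
# names so the same book code can target several backends. A tiny demo of the
# mapping; the values are arbitrary and chosen only for illustration.
def _alias_demo():
    x = tensor([[1., 2.], [3., 4.]])       # tf.constant
    total = reduce_sum(x)                   # tf.reduce_sum -> 10.0
    casted = astype(arange(3), float32)     # tf.cast(tf.range(3), tf.float32)
    return numpy(x), float(total), casted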