Update documentation to new API.

Makoto Matsuyama 2015-10-05 16:28:17 -07:00
parent cb77f7d7e2
commit 0b8a52e463
17 changed files with 271 additions and 222 deletions

README.md

@@ -34,13 +34,16 @@ from keras.layers.core import Dense, Dropout, Activation
 from keras.optimizers import SGD
 model = Sequential()
-model.add(Dense(20, 64, init='uniform'))
+# Dense(64) is a fully-connected layer with 64 hidden units.
+# in the first layer, you must specify the expected input data shape:
+# here, 20-dimensional vectors.
+model.add(Dense(64, input_dim=20, init='uniform'))
 model.add(Activation('tanh'))
 model.add(Dropout(0.5))
-model.add(Dense(64, 64, init='uniform'))
+model.add(Dense(64, init='uniform'))
 model.add(Activation('tanh'))
 model.add(Dropout(0.5))
-model.add(Dense(64, 2, init='uniform'))
+model.add(Dense(2, init='uniform'))
 model.add(Activation('softmax'))
 sgd = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True)
@@ -54,11 +57,11 @@ score = model.evaluate(X_test, y_test, batch_size=16)
 ```python
 model = Sequential()
-model.add(Dense(20, 64, init='uniform', activation='tanh'))
+model.add(Dense(64, input_dim=20, init='uniform', activation='tanh'))
 model.add(Dropout(0.5))
-model.add(Dense(64, 64, init='uniform', activation='tanh'))
+model.add(Dense(64, init='uniform', activation='tanh'))
 model.add(Dropout(0.5))
-model.add(Dense(64, 2, init='uniform', activation='softmax'))
+model.add(Dense(2, init='uniform', activation='softmax'))
 sgd = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True)
 model.compile(loss='mean_squared_error', optimizer=sgd)
@@ -73,26 +76,29 @@ from keras.layers.convolutional import Convolution2D, MaxPooling2D
 from keras.optimizers import SGD
 model = Sequential()
-model.add(Convolution2D(32, 3, 3, 3, border_mode='full'))
+# input: 100x100 images with 3 channels -> (3, 100, 100) tensors.
+# this applies 32 convolution filters of size 3x3 each.
+model.add(Convolution2D(32, 3, 3, border_mode='full', input_shape=(3, 100, 100)))
 model.add(Activation('relu'))
-model.add(Convolution2D(32, 32, 3, 3))
+model.add(Convolution2D(32, 3, 3))
 model.add(Activation('relu'))
 model.add(MaxPooling2D(poolsize=(2, 2)))
 model.add(Dropout(0.25))
-model.add(Convolution2D(64, 32, 3, 3, border_mode='full'))
+model.add(Convolution2D(64, 3, 3, border_mode='valid'))
 model.add(Activation('relu'))
-model.add(Convolution2D(64, 64, 3, 3))
+model.add(Convolution2D(64, 3, 3))
 model.add(Activation('relu'))
 model.add(MaxPooling2D(poolsize=(2, 2)))
 model.add(Dropout(0.25))
 model.add(Flatten())
-model.add(Dense(64*8*8, 256))
+# Note: Keras does automatic shape inference.
+model.add(Dense(256))
 model.add(Activation('relu'))
 model.add(Dropout(0.5))
-model.add(Dense(256, 10))
+model.add(Dense(10))
 model.add(Activation('softmax'))
 sgd = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True)
@@ -112,9 +118,9 @@ from keras.layers.recurrent import LSTM
 model = Sequential()
 model.add(Embedding(max_features, 256))
-model.add(LSTM(256, 128, activation='sigmoid', inner_activation='hard_sigmoid'))
+model.add(LSTM(output_dim=128, activation='sigmoid', inner_activation='hard_sigmoid'))
 model.add(Dropout(0.5))
-model.add(Dense(128, 1))
+model.add(Dense(1))
 model.add(Activation('sigmoid'))
 model.compile(loss='binary_crossentropy', optimizer='rmsprop')
@@ -126,51 +132,67 @@ score = model.evaluate(X_test, Y_test, batch_size=16)
 ### Architecture for learning image captions with a convnet and a Gated Recurrent Unit:
 (word-level embedding, caption of maximum length 16 words).
-Note that getting this to actually "work" will require using a bigger convnet, initialized with pre-trained weights.
+Note that getting this to work well will require using a bigger convnet, initialized with pre-trained weights.
+Displaying readable results will also require an embedding decoder.
 ```python
 max_caption_len = 16
-model = Sequential()
-model.add(Convolution2D(32, 3, 3, 3, border_mode='full'))
-model.add(Activation('relu'))
-model.add(Convolution2D(32, 32, 3, 3))
-model.add(Activation('relu'))
-model.add(MaxPooling2D(poolsize=(2, 2)))
-model.add(Convolution2D(64, 32, 3, 3, border_mode='full'))
-model.add(Activation('relu'))
-model.add(Convolution2D(64, 64, 3, 3))
-model.add(Activation('relu'))
-model.add(MaxPooling2D(poolsize=(2, 2)))
-model.add(Convolution2D(128, 64, 3, 3, border_mode='full'))
-model.add(Activation('relu'))
-model.add(Convolution2D(128, 128, 3, 3))
-model.add(Activation('relu'))
-model.add(MaxPooling2D(poolsize=(2, 2)))
-model.add(Flatten())
-model.add(Dense(128*4*4, 256))
-model.add(Activation('relu'))
-model.add(Dropout(0.5))
-model.add(RepeatVector(max_caption_len))
-# the GRU below returns sequences of max_caption_len vectors of size 256 (our word embedding size)
-model.add(GRU(256, 256, return_sequences=True))
-model.compile(loss='mean_squared_error', optimizer='rmsprop')
-# "images" is a numpy array of shape (nb_samples, nb_channels=3, width, height)
-# "captions" is a numpy array of shape (nb_samples, max_caption_len=16, embedding_dim=256)
-# captions are supposed already embedded (dense vectors).
-model.fit(images, captions, batch_size=16, nb_epoch=100)
+vocab_size = 10000
+# first, let's define an image model that
+# will encode pictures into 128-dimensional vectors.
+# it should be initialized with pre-trained weights.
+image_model = Sequential()
+image_model.add(Convolution2D(32, 3, 3, border_mode='full', input_shape=(3, 100, 100)))
+image_model.add(Activation('relu'))
+image_model.add(Convolution2D(32, 3, 3))
+image_model.add(Activation('relu'))
+image_model.add(MaxPooling2D(poolsize=(2, 2)))
+image_model.add(Convolution2D(64, 3, 3, border_mode='full'))
+image_model.add(Activation('relu'))
+image_model.add(Convolution2D(64, 3, 3))
+image_model.add(Activation('relu'))
+image_model.add(MaxPooling2D(poolsize=(2, 2)))
+image_model.add(Flatten())
+image_model.add(Dense(128))
+# let's load the weights from a save file.
+image_model.load_weights('weight_file.h5')
+# next, let's define a RNN model that encodes sequences of words
+# into sequences of 128-dimensional word vectors.
+language_model = Sequential()
+language_model.add(Embedding(vocab_size, 256, input_length=max_caption_len))
+language_model.add(GRU(output_dim=128, return_sequences=True))
+language_model.add(Dense(128))
+# let's repeat the image vector to turn it into a sequence.
+image_model.add(RepeatVector(max_caption_len))
+# the output of both models will be tensors of shape (samples, max_caption_len, 128).
+# let's concatenate these 2 vector sequences.
+model = Merge([image_model, language_model], mode='concat', concat_axis=-1)
+# let's encode this vector sequence into a single vector
+model.add(GRU(256, 256, return_sequences=False))
+# which will be used to compute a probability
+# distribution over what the next word in the caption should be!
+model.add(Dense(vocab_size))
+model.add(Activation('softmax'))
+model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
+# "images" is a numpy float array of shape (nb_samples, nb_channels=3, width, height).
+# "captions" is a numpy integer array of shape (nb_samples, max_caption_len)
+# containing word index sequences representing partial captions.
+# "next_words" is a numpy float array of shape (nb_samples, vocab_size)
+# containing a categorical encoding (0s and 1s) of the next word in the corresponding
+# partial caption.
+model.fit([images, partial_captions], next_words, batch_size=16, nb_epoch=100)
 ```
 In the examples folder, you will find example models for real datasets:
-- CIFAR10 small images classification: Convnet with realtime data augmentation
+- CIFAR10 small images classification: Convolutional Neural Network (CNN) with realtime data augmentation
 - IMDB movie review sentiment classification: LSTM over sequences of words
 - Reuters newswires topic classification: Multilayer Perceptron (MLP)
 - MNIST handwritten digits classification: MLP & CNN
@@ -183,7 +205,7 @@ In the examples folder, you will find example models for real datasets:
 For complete coverage of the API, check out [the Keras documentation](http://keras.io).
-A few highlights: convnets, LSTM, GRU, word2vec-style embeddings, PReLU, batch normalization...
+A few highlights: convnets, LSTM, GRU, word2vec-style embeddings, PReLU, BatchNormalization...
 ## Installation
@@ -196,7 +218,7 @@ Keras uses the following dependencies:
 - HDF5 and h5py (optional, required if you use model saving/loading functions)
 - Optional but recommended if you use CNNs: cuDNN.
-Once you have the dependencies installed, cd to the Keras folder and run the install command:
+To install, `cd` to the Keras folder and run the install command:
 ```
 sudo python setup.py install
 ```

@@ -6,12 +6,12 @@ Activations can either be used through an `Activation` layer, or through the `ac
 ```python
 from keras.layers.core import Activation, Dense
-model.add(Dense(64, 64, init='uniform'))
+model.add(Dense(64))
 model.add(Activation('tanh'))
 ```
 is equivalent to:
 ```python
-model.add(Dense(20, 64, init='uniform', activation='tanh'))
+model.add(Dense(64, activation='tanh'))
 ```
 You can also pass an element-wise Theano function as an activation:
@@ -20,7 +20,7 @@ You can also pass an element-wise Theano function as an activation:
 def tanh(x):
     return theano.tensor.tanh(x)
-model.add(Dense(20, 64, init='uniform', activation=tanh))
+model.add(Dense(64, activation=tanh))
 model.add(Activation(tanh))
 ```

@@ -75,7 +75,7 @@ class LossHistory(keras.callbacks.Callback):
         self.losses.append(logs.get('loss'))
 model = Sequential()
-model.add(Dense(784, 10, init='uniform'))
+model.add(Dense(10, input_dim=784, init='uniform'))
 model.add(Activation('softmax'))
 model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
@@ -97,7 +97,7 @@ print history.losses
 from keras.callbacks import ModelCheckpoint
 model = Sequential()
-model.add(Dense(784, 10, init='uniform'))
+model.add(Dense(10, input_dim=784, init='uniform'))
 model.add(Activation('softmax'))
 model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
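For context, a minimal sketch of how the checkpoint callback above might be wired into training under the updated API; the data arrays (`X_train`, `Y_train`) and the file path are placeholders, and the exact `ModelCheckpoint` options should be checked against the callbacks reference:

```python
from keras.callbacks import ModelCheckpoint

# save weights after every epoch, keeping only the best model seen so far
checkpointer = ModelCheckpoint(filepath='/tmp/weights.hdf5', verbose=1, save_best_only=True)
model.fit(X_train, Y_train, batch_size=128, nb_epoch=20,
          validation_split=0.1, callbacks=[checkpointer])
```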

@@ -12,7 +12,7 @@ These layers expose 2 keyword arguments:
 ```python
 from keras.constraints import maxnorm
-model.add(Dense(64, 64, W_constraint = maxnorm(2)))
+model.add(Dense(64, W_constraint = maxnorm(2)))
 ```
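As a further illustration, a hedged sketch combining a max-norm constraint on the weight matrix with a non-negativity constraint on the bias; this assumes `nonneg` is among the constraints listed below:

```python
from keras.constraints import maxnorm, nonneg

# constrain the weight matrix norm and keep the bias non-negative
model.add(Dense(64, W_constraint=maxnorm(2), b_constraint=nonneg()))
```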
## Available constraints

@@ -1,7 +1,7 @@
 Here are a few examples to get you started!
-### Multilayer Perceptron (MLP)
+### Multilayer Perceptron (MLP):
 ```python
 from keras.models import Sequential
@ -9,13 +9,16 @@ from keras.layers.core import Dense, Dropout, Activation
from keras.optimizers import SGD from keras.optimizers import SGD
model = Sequential() model = Sequential()
model.add(Dense(20, 64, init='uniform')) # Dense(64) is a fully-connected layer with 64 hidden units.
# in the first layer, you must specify the expected input data shape:
# here, 20-dimensional vectors.
model.add(Dense(64, input_dim=20, init='uniform'))
model.add(Activation('tanh')) model.add(Activation('tanh'))
model.add(Dropout(0.5)) model.add(Dropout(0.5))
model.add(Dense(64, 64, init='uniform')) model.add(Dense(64, init='uniform'))
model.add(Activation('tanh')) model.add(Activation('tanh'))
model.add(Dropout(0.5)) model.add(Dropout(0.5))
model.add(Dense(64, 2, init='uniform')) model.add(Dense(2, init='uniform'))
model.add(Activation('softmax')) model.add(Activation('softmax'))
sgd = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True) sgd = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True)
@@ -25,25 +28,21 @@ model.fit(X_train, y_train, nb_epoch=20, batch_size=16)
 score = model.evaluate(X_test, y_test, batch_size=16)
 ```
----
-### Alternative implementation of MLP
+### Alternative implementation of MLP:
 ```python
 model = Sequential()
-model.add(Dense(20, 64, init='uniform', activation='tanh'))
+model.add(Dense(64, input_dim=20, init='uniform', activation='tanh'))
 model.add(Dropout(0.5))
-model.add(Dense(64, 64, init='uniform', activation='tanh'))
+model.add(Dense(64, init='uniform', activation='tanh'))
 model.add(Dropout(0.5))
-model.add(Dense(64, 2, init='uniform', activation='softmax'))
+model.add(Dense(2, init='uniform', activation='softmax'))
 sgd = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True)
 model.compile(loss='mean_squared_error', optimizer=sgd)
 ```
----
-### VGG-like convnet
+### VGG-like convnet:
 ```python
 from keras.models import Sequential
@ -52,26 +51,29 @@ from keras.layers.convolutional import Convolution2D, MaxPooling2D
from keras.optimizers import SGD from keras.optimizers import SGD
model = Sequential() model = Sequential()
model.add(Convolution2D(32, 3, 3, 3, border_mode='full')) # input: 100x100 images with 3 channels -> (3, 100, 100) tensors.
# this applies 32 convolution filters of size 3x3 each.
model.add(Convolution2D(32, 3, 3, border_mode='full', input_shape=(3, 100, 100)))
model.add(Activation('relu')) model.add(Activation('relu'))
model.add(Convolution2D(32, 32, 3, 3)) model.add(Convolution2D(32, 3, 3))
model.add(Activation('relu')) model.add(Activation('relu'))
model.add(MaxPooling2D(poolsize=(2, 2))) model.add(MaxPooling2D(poolsize=(2, 2)))
model.add(Dropout(0.25)) model.add(Dropout(0.25))
model.add(Convolution2D(64, 32, 3, 3, border_mode='full')) model.add(Convolution2D(64, 3, 3, border_mode='valid'))
model.add(Activation('relu')) model.add(Activation('relu'))
model.add(Convolution2D(64, 64, 3, 3)) model.add(Convolution2D(64, 3, 3))
model.add(Activation('relu')) model.add(Activation('relu'))
model.add(MaxPooling2D(poolsize=(2, 2))) model.add(MaxPooling2D(poolsize=(2, 2)))
model.add(Dropout(0.25)) model.add(Dropout(0.25))
model.add(Flatten()) model.add(Flatten())
model.add(Dense(64*8*8, 256)) # Note: Keras does automatic shape inference.
model.add(Dense(256))
model.add(Activation('relu')) model.add(Activation('relu'))
model.add(Dropout(0.5)) model.add(Dropout(0.5))
model.add(Dense(256, 10)) model.add(Dense(10))
model.add(Activation('softmax')) model.add(Activation('softmax'))
sgd = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True) sgd = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True)
@@ -81,9 +83,7 @@ model.fit(X_train, Y_train, batch_size=32, nb_epoch=1)
 ```
----
-### Sequence classification with LSTM
+### Sequence classification with LSTM:
 ```python
 from keras.models import Sequential
@ -92,11 +92,10 @@ from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM from keras.layers.recurrent import LSTM
model = Sequential() model = Sequential()
# Add a mask_zero=True to the Embedding connstructor if 0 is a left-padding value in your data
model.add(Embedding(max_features, 256)) model.add(Embedding(max_features, 256))
model.add(LSTM(256, 128, activation='sigmoid', inner_activation='hard_sigmoid')) model.add(LSTM(output_dim=128, activation='sigmoid', inner_activation='hard_sigmoid'))
model.add(Dropout(0.5)) model.add(Dropout(0.5))
model.add(Dense(128, 1)) model.add(Dense(1))
model.add(Activation('sigmoid')) model.add(Activation('sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='rmsprop') model.compile(loss='binary_crossentropy', optimizer='rmsprop')
@ -105,59 +104,73 @@ model.fit(X_train, Y_train, batch_size=16, nb_epoch=10)
score = model.evaluate(X_test, Y_test, batch_size=16) score = model.evaluate(X_test, Y_test, batch_size=16)
``` ```
--- ### Architecture for learning image captions with a convnet and a Gated Recurrent Unit:
(word-level embedding, caption of maximum length 16 words).
### Image captioning Note that getting this to work well will require using a bigger convnet, initialized with pre-trained weights.
Architecture for learning image captions with a convnet and a Gated Recurrent Unit (word-level embedding, caption of maximum length 16 words).
Note that getting this to actually "work" will require using a bigger convnet, initialized with pre-trained weights.
Displaying readable results will also require an embedding decoder.
```python ```python
max_caption_len = 16 max_caption_len = 16
vocab_size = 10000
model = Sequential() # first, let's define an image model that
model.add(Convolution2D(32, 3, 3, 3, border_mode='full')) # will encode pictures into 128-dimensional vectors.
model.add(Activation('relu')) # it should be initialized with pre-trained weights.
model.add(Convolution2D(32, 32, 3, 3)) image_model = Sequential()
model.add(Activation('relu')) image_model.add(Convolution2D(32, 3, 3, border_mode='full', input_shape=(3, 100, 100)))
model.add(MaxPooling2D(poolsize=(2, 2))) image_model.add(Activation('relu'))
image_model.add(Convolution2D(32, 3, 3))
image_model.add(Activation('relu'))
image_model.add(MaxPooling2D(poolsize=(2, 2)))
model.add(Convolution2D(64, 32, 3, 3, border_mode='full')) image_model.add(Convolution2D(64, 3, 3, border_mode='full'))
model.add(Activation('relu')) image_model.add(Activation('relu'))
model.add(Convolution2D(64, 64, 3, 3)) image_model.add(Convolution2D(64, 3, 3))
model.add(Activation('relu')) image_model.add(Activation('relu'))
model.add(MaxPooling2D(poolsize=(2, 2))) image_model.add(MaxPooling2D(poolsize=(2, 2)))
model.add(Convolution2D(128, 64, 3, 3, border_mode='full')) image_model.add(Flatten())
model.add(Activation('relu')) image_model.add(Dense(128))
model.add(Convolution2D(128, 128, 3, 3))
model.add(Activation('relu'))
model.add(MaxPooling2D(poolsize=(2, 2)))
model.add(Flatten()) # let's load the weights from a save file.
model.add(Dense(128*4*4, 256)) image_model.load_weights('weight_file.h5')
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(RepeatVector(max_caption_len)) # next, let's define a RNN model that encodes sequences of words
# the GRU below returns sequences of max_caption_len vectors of size 256 (our word embedding size) # into sequences of 128-dimensional word vectors.
model.add(GRU(256, 256, return_sequences=True)) language_model = Sequential()
language_model.add(Embedding(vocab_size, 256, input_length=max_caption_len))
language_model.add(GRU(output_dim=128, return_sequences=True))
language_model.add(Dense(128))
model.compile(loss='mean_squared_error', optimizer='rmsprop') # let's repeat the image vector to turn it into a sequence.
image_model.add(RepeatVector(max_caption_len))
# "images" is a numpy array of shape (nb_samples, nb_channels=3, width, height) # the output of both models will be tensors of shape (samples, max_caption_len, 128).
# "captions" is a numpy array of shape (nb_samples, max_caption_len=16, embedding_dim=256) # let's concatenate these 2 vector sequences.
# captions are supposed already embedded (dense vectors). model = Merge([image_model, language_model], mode='concat', concat_axis=-1)
model.fit(images, captions, batch_size=16, nb_epoch=100) # let's encode this vector sequence into a single vector
model.add(GRU(256, 256, return_sequences=False))
# which will be used to compute a probability
# distribution over what the next word in the caption should be!
model.add(Dense(vocab_size))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
# "images" is a numpy float array of shape (nb_samples, nb_channels=3, width, height).
# "captions" is a numpy integer array of shape (nb_samples, max_caption_len)
# containing word index sequences representing partial captions.
# "next_words" is a numpy float array of shape (nb_samples, vocab_size)
# containing a categorical encoding (0s and 1s) of the next word in the corresponding
# partial caption.
model.fit([images, partial_captions], next_words, batch_size=16, nb_epoch=100)
``` ```
----
-In the [examples folder](https://github.com/fchollet/keras/tree/master/examples), you will find example models for real datasets:
-- CIFAR10 small images classification: Convnet with realtime data augmentation
+In the examples folder, you will find example models for real datasets:
+- CIFAR10 small images classification: Convolutional Neural Network (CNN) with realtime data augmentation
 - IMDB movie review sentiment classification: LSTM over sequences of words
-- Reuters newswires topic classification: Multilayer Perceptron
+- Reuters newswires topic classification: Multilayer Perceptron (MLP)
+- MNIST handwritten digits classification: MLP & CNN
+- Character-level text generation with LSTM
+...and more.

@@ -46,9 +46,9 @@ Stacking layers is as easy as `.add()`:
 ```python
 from keras.layers.core import Dense, Activation
-model.add(Dense(input_dim=100, output_dim=64, init="glorot_uniform"))
+model.add(Dense(output_dim=64, input_dim=100, init="glorot_uniform"))
 model.add(Activation("relu"))
-model.add(Dense(input_dim=64, output_dim=10, init="glorot_uniform"))
+model.add(Dense(output_dim=10, init="glorot_uniform"))
 model.add(Activation("softmax"))
 ```
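After stacking layers, the model would typically be compiled and fit; a minimal sketch continuing the snippet above, with the optimizer settings and the `X_train`/`Y_train` arrays as placeholders:

```python
from keras.optimizers import SGD

# configure the learning process, then iterate on the training data in batches
model.compile(loss='categorical_crossentropy', optimizer=SGD(lr=0.01, momentum=0.9, nesterov=True))
model.fit(X_train, Y_train, nb_epoch=5, batch_size=32)
```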

@@ -6,7 +6,7 @@ Initializations define the probability distribution used to set the initial rand
 The keyword arguments used for passing initializations to layers will depend on the layer. Usually it is simply `init`:
 ```python
-model.add(Dense(64, 64, init='uniform'))
+model.add(Dense(64, init='uniform'))
 ```
## Available initializations ## Available initializations

@@ -7,7 +7,8 @@ keras.layers.advanced_activations.LeakyReLU(alpha=0.3)
 Special version of a Rectified Linear Unit that allows a small gradient when the unit is not active (`f(x) = alpha*x for x < 0`).
-- __Input shape__: This layer does not assume a specific input shape. As a result, it cannot be used as the first layer in a model.
+- __Input shape__: Arbitrary. Use the keyword argument `input_shape` (tuple of integers, does not include the samples axis) when using this layer as the first layer in a model.
 - __Output shape__: Same as input.
@@ -19,18 +20,16 @@ Special version of a Rectified Linear Unit that allows a small gradient when the
 ## PReLU
 ```python
-keras.layers.advanced_activations.PReLU(input_shape)
+keras.layers.advanced_activations.PReLU()
 ```
 Parametrized linear unit. Similar to a LeakyReLU, where each input unit has its alpha coefficient, and where these coefficients are learned during training.
-- __Input shape__: Same as `input_shape`. This layer cannot be used as first layer in a model.
+- __Input shape__: Arbitrary. Use the keyword argument `input_shape` (tuple of integers, does not include the samples axis) when using this layer as the first layer in a model.
 - __Output shape__: Same as input.
-- __Arguments__:
-    - __input_shape__: tuple.
 - __References__:
     - [Delving Deep into Rectifiers: Surpassing Human-Level Performance on ImageNet Classification](http://arxiv.org/pdf/1502.01852v1.pdf)
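A short sketch of the updated usage: the advanced activation is simply added after a regular layer and no longer takes an `input_shape` of its own (the layer sizes here are arbitrary):

```python
from keras.models import Sequential
from keras.layers.core import Dense
from keras.layers.advanced_activations import PReLU

model = Sequential()
model.add(Dense(64, input_dim=20))
model.add(PReLU())  # learns one alpha coefficient per input unit
```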
@@ -39,18 +38,15 @@ Parametrized linear unit. Similar to a LeakyReLU, where each input unit has its
 ## ParametricSoftplus
 ```python
-keras.layers.advanced_activations.ParametricSoftplus(input_shape)
+keras.layers.advanced_activations.ParametricSoftplus()
 ```
 Parametric Softplus of the form: (`f(x) = alpha * (1 + exp(beta * x))`). This is essentially a smooth version of ReLU where the parameters control the sharpness of the rectification. The parameters are initialized to more closely approximate a ReLU than the standard `softplus`: `alpha` initialized to `0.2` and `beta` initialized to `5.0`. The parameters are fit separately for each hidden unit.
-- __Input shape__: Same as `input_shape`. This layer cannot be used as first layer in a model.
+- __Input shape__: Arbitrary. Use the keyword argument `input_shape=...` when using this layer as the first layer in a model.
 - __Output shape__: Same as input.
-- __Arguments__:
-    - __input_shape__: tuple.
 - __References__:
     - [Inferring Nonlinear Neuronal Computation Based on Physiologically Plausible Inputs](http://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1003143)
@@ -62,7 +58,8 @@ keras.layers.advanced_activations.ThresholdedLinear(theta)
 Parametrized linear unit. provides a threshold near zero where values are zeroed.
-- __Input shape__: Same as `input_shape`. This layer cannot be used as first layer in a model.
+- __Input shape__: Arbitrary. Use the keyword argument `input_shape` (tuple of integers, does not include the samples axis) when using this layer as the first layer in a model.
 - __Output shape__: Same as input.
@@ -80,7 +77,7 @@ keras.layers.advanced_activations.ThresholdedReLu(theta)
 Parametrized rectified linear unit. provides a threshold near zero where values are zeroed.
-- __Input shape__: Same as `input_shape`. This layer cannot be used as first layer in a model.
+- __Input shape__: Arbitrary. Use the keyword argument `input_shape=...` when using this layer as the first layer in a model.
 - __Output shape__: Same as input.

@@ -2,22 +2,20 @@
 ## Convolution1D
 ```python
-keras.layers.convolutional.Convolution1D(input_dim, nb_filter, filter_length,
+keras.layers.convolutional.Convolution1D(nb_filter, filter_length,
     init='uniform', activation='linear', weights=None,
     border_mode='valid', subsample_length=1,
     W_regularizer=None, b_regularizer=None, W_constraint=None,
-    b_constraint=None)
+    b_constraint=None, input_dim=None, input_length=None)
 ```
-Convolution operator for filtering neighborhoods of one-dimensional inputs.
+Convolution operator for filtering neighborhoods of one-dimensional inputs. When using this layer as the first layer in a model, either provide the keyword argument `input_dim` (int, e.g. 128 for sequences of 128-dimensional vectors), or `input_shape` (tuple of integers, e.g. (10, 128) for sequences of 10 vectors of 128-dimensional vectors).
 - __Input shape__: 3D tensor with shape: `(nb_samples, steps, input_dim)`.
 - __Output shape__: 3D tensor with shape: `(nb_samples, steps, nb_filter)`. `steps` value might have changed due to padding.
 - __Arguments__:
-    - __input_dim__: Number of channels/dimensions in the input.
     - __nb_filter__: Number of convolution kernels to use (dimensionality of the output).
     - __filter_length__: The extension (spatial or temporal) of each filter.
     - __init__: name of initialization function for the weights of the layer (see: [initializations](../initializations.md)), or alternatively, Theano function to use for weights initialization. This parameter is only relevant if you don't pass a `weights` argument.
@@ -30,31 +28,32 @@ Convolution operator for filtering neighborhoods of one-dimensional inputs.
     - __activity_regularizer__: instance of [ActivityRegularizer](../regularizers.md), applied to the network output.
     - __W_constraint__: instance of the [constraints](../constraints.md) module (eg. maxnorm, nonneg), applied to the main weights matrix.
     - __b_constraint__: instance of the [constraints](../constraints.md) module, applied to the bias.
+    - __input_dim__: Number of channels/dimensions in the input. Either this argument or the keyword argument `input_shape` must be provided when using this layer as the first layer in a model.
+    - __input_length__: Length of input sequences, when it is constant. This argument is required if you are going to connect `Flatten` then `Dense` layers upstream (without it, the shape of the dense outputs cannot be computed).
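A brief sketch of the updated constructor on sequence data, assuming 10-step sequences of 32-dimensional vectors (all sizes are illustrative):

```python
from keras.models import Sequential
from keras.layers.convolutional import Convolution1D

model = Sequential()
# input: (nb_samples, 10, 32) -> output: (nb_samples, new_steps, 64)
model.add(Convolution1D(64, 3, border_mode='valid', input_shape=(10, 32)))
```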
 ---
 ## Convolution2D
 ```python
-keras.layers.convolutional.Convolution2D(nb_filter, stack_size, nb_row, nb_col,
+keras.layers.convolutional.Convolution2D(nb_filter, nb_row, nb_col,
     init='glorot_uniform', activation='linear', weights=None,
     border_mode='valid', subsample=(1, 1),
     W_regularizer=None, b_regularizer=None, W_constraint=None)
 ```
-Convolution operator for filtering windows of two-dimensional inputs.
+Convolution operator for filtering windows of two-dimensional inputs. When using this layer as the first layer in a model, provide the keyword argument `input_shape` (tuple of integers, does not include the sample axis), e.g. `input_shape=(3, 128, 128)` for 128x128 RGB pictures.
-- __Input shape__: 4D tensor with shape: `(nb_samples, stack_size, nb_row, nb_col)`.
+- __Input shape__: 4D tensor with shape: `(nb_samples, channels, rows, cols)`.
-- __Output shape__: 4D tensor with shape: `(nb_samples, nb_filter, nb_row, nb_col)`. `nb_row`, `nb_col` might have changed due to padding.
+- __Output shape__: 4D tensor with shape: `(nb_samples, nb_filter, rows, cols)`. `rows`, `cols` might have changed due to padding.
 - __Arguments__:
-    - __nb_filter__: Number of convolution kernels to use.
-    - __stack_size__: Number of channels in the input.
-    - __nb_row__: Number of rows in the convolution kernels
-    - __nb_col__: Number of columns in the convolution kernels
+    - __nb_filter__: Number of convolution filters to use.
+    - __nb_row__: Number of rows in the convolution kernel.
+    - __nb_col__: Number of columns in the convolution kernel.
     - __init__: name of initialization function for the weights of the layer (see: [initializations](../initializations.md)), or alternatively, Theano function to use for weights initialization. This parameter is only relevant if you don't pass a `weights` argument.
     - __activation__: name of activation function to use (see: [activations](../activations.md)), or alternatively, elementwise Theano function. If you don't specify anything, no activation is applied (ie. "linear" activation: a(x) = x).
     - __weights__: list of numpy arrays to set as initial weights.
@@ -90,7 +89,7 @@ keras.layers.convolutional.MaxPooling1D(pool_length=2, stride=None, ignore_borde
 ## MaxPooling2D
 ```python
-keras.layers.convolutional.MaxPooling2D(poolsize=(2, 2), ignore_border=True)
+keras.layers.convolutional.MaxPooling2D(pool_size=(2, 2), ignore_border=True)
 ```
 - __Input shape__: 4D tensor with shape: `(nb_samples, stack_size, nb_row, nb_col)`.
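Putting the updated 2D layers together, a hedged sketch for 32x32 RGB inputs (the filter counts are arbitrary):

```python
from keras.models import Sequential
from keras.layers.convolutional import Convolution2D, MaxPooling2D

model = Sequential()
# input: (nb_samples, 3, 32, 32) tensors
model.add(Convolution2D(32, 3, 3, border_mode='valid', input_shape=(3, 32, 32)))
model.add(MaxPooling2D(pool_size=(2, 2)))
```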

@@ -76,8 +76,9 @@ get_config()
 ## Dense
 ```python
-keras.layers.core.Dense(input_dim, output_dim, init='glorot_uniform', activation='linear', weights=None \
-    W_regularizer=None, b_regularizer=None, activity_regularizer=None, W_constraint=None, b_constraint=None)
+keras.layers.core.Dense(output_dim, init='glorot_uniform', activation='linear', weights=None
+    W_regularizer=None, b_regularizer=None, activity_regularizer=None,
+    W_constraint=None, b_constraint=None, input_dim=None)
 ```
 Standard 1D fully-connect layer.
@@ -88,7 +89,6 @@ Standard 1D fully-connect layer.
 - __Arguments__:
-    - __input_dim__: int >= 0.
     - __output_dim__: int >= 0.
     - __init__: name of initialization function for the weights of the layer (see: [initializations](../initializations.md)), or alternatively, Theano function to use for weights initialization. This parameter is only relevant if you don't pass a `weights` argument.
     - __activation__: name of activation function to use (see: [activations](../activations.md)), or alternatively, elementwise Theano function. If you don't specify anything, no activation is applied (ie. "linear" activation: a(x) = x).
@@ -98,21 +98,22 @@ Standard 1D fully-connect layer.
     - __activity_regularizer__: instance of [ActivityRegularizer](../regularizers.md), applied to the network output.
     - __W_constraint__: instance of the [constraints](../constraints.md) module (eg. maxnorm, nonneg), applied to the main weights matrix.
     - __b_constraint__: instance of the [constraints](../constraints.md) module, applied to the bias.
+    - __input_dim__: dimensionality of the input (integer). This argument (or alternatively, the keyword argument `input_shape`) is required when using this layer as the first layer in a model.
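A two-line sketch of the new `Dense` usage: only the first layer declares its input dimensionality, and later layers infer their input shape automatically:

```python
from keras.models import Sequential
from keras.layers.core import Dense

model = Sequential()
model.add(Dense(64, input_dim=20, init='glorot_uniform', activation='relu'))
model.add(Dense(10, activation='softmax'))  # input shape (nb_samples, 64) is inferred
```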
 ---
 ## TimeDistributedDense
 ```python
-keras.layers.core.TimeDistributedDense(input_dim, output_dim, init='glorot_uniform', activation='linear', weights=None \
-    W_regularizer=None, b_regularizer=None, activity_regularizer=None, W_constraint=None, b_constraint=None)
+keras.layers.core.TimeDistributedDense(output_dim, init='glorot_uniform', activation='linear', weights=None
+    W_regularizer=None, b_regularizer=None, activity_regularizer=None, W_constraint=None, b_constraint=None,
+    input_dim=None, input_length=None)
 ```
 Fully-connected layer distributed over the time dimension. Useful after a recurrent network set to `return_sequences=True`.
-- __Input shape__: 3D tensor with shape: `(nb_samples, nb_timesteps, input_dim)`.
+- __Input shape__: 3D tensor with shape: `(nb_samples, timesteps, input_dim)`.
 - __Arguments__:
-    - __input_dim__: int >= 0.
     - __output_dim__: int >= 0.
     - __init__: name of initialization function for the weights of the layer (see: [initializations](../initializations.md)), or alternatively, Theano function to use for weights initialization. This parameter is only relevant if you don't pass a `weights` argument.
     - __activation__: name of activation function to use (see: [activations](../activations.md)), or alternatively, elementwise Theano function. If you don't specify anything, no activation is applied (ie. "linear" activation: a(x) = x).
@@ -122,12 +123,14 @@ Fully-connected layer distributed over the time dimension. Useful after a recurr
     - __activity_regularizer__: instance of [ActivityRegularizer](../regularizers.md), applied to the network output.
     - __W_constraint__: instance of the [constraints](../constraints.md) module (eg. maxnorm, nonneg), applied to the main weights matrix.
     - __b_constraint__: instance of the [constraints](../constraints.md) module, applied to the bias.
+    - __input_dim__: dimensionality of the input (integer). This argument (or alternatively, the keyword argument `input_shape`) is required when using this layer as the first layer in a model.
+    - __input_length__: Length of input sequences, when it is constant. This argument is required if you are going to connect `Flatten` then `Dense` layers upstream (without it, the shape of the dense outputs cannot be computed).
 - __Example__:
 ```python
-# input shape: (nb_samples, nb_timesteps, 10)
+# input shape: (nb_samples, timesteps, 10)
-model.add(LSTM(10, 5, return_sequences=True)) # output shape: (nb_samples, nb_timesteps, 5)
+model.add(LSTM(5, return_sequences=True, input_dim=10)) # output shape: (nb_samples, timesteps, 5)
-model.add(TimeDistributedDense(5, 10)) # output shape: (nb_samples, nb_timesteps, 10)
+model.add(TimeDistributedDense(15)) # output shape: (nb_samples, timesteps, 15)
 ```
@@ -160,8 +163,8 @@ A customizable autoencoder model. If `output_reconstruction = True` then dim(inp
 from keras.layers import containers
 # input shape: (nb_samples, 32)
-encoder = containers.Sequential([Dense(32, 16), Dense(16, 8)])
-decoder = containers.Sequential([Dense(8, 16), Dense(16, 32)])
+encoder = containers.Sequential([Dense(16, input_dim=32), Dense(8)])
+decoder = containers.Sequential([Dense(16, input_dim=8), Dense(32)])
 autoencoder = Sequential()
 autoencoder.add(AutoEncoder(encoder=encoder, decoder=decoder, output_reconstruction=False))
@@ -176,7 +179,8 @@ keras.layers.core.Activation(activation)
 ```
 Apply an activation function to the input.
-- __Input shape__: This layer does not assume a specific input shape. As a result, it cannot be used as the first layer in a model.
+- __Input shape__: Arbitrary. Use the keyword argument `input_shape` (tuple of integers, does not include the samples axis) when using this layer as the first layer in a model.
 - __Output shape__: Same as input.
@@ -193,7 +197,8 @@ keras.layers.core.Dropout(p)
 ```
 Apply dropout to the input. Dropout consists in randomly setting a fraction `p` of input units to 0 at each update during training time, which helps prevent overfitting. Reference: [Dropout: A Simple Way to Prevent Neural Networks from Overfitting](http://www.cs.toronto.edu/~rsalakhu/papers/srivastava14a.pdf)
-- __Input shape__: This layer does not assume a specific input shape.
+- __Input shape__: Arbitrary. Use the keyword argument `input_shape` (tuple of integers, does not include the samples axis) when using this layer as the first layer in a model.
 - __Output shape__: Same as input.
@@ -206,24 +211,25 @@ Apply dropout to the input. Dropout consists in randomly setting a fraction `p`
 ## Reshape
 ```python
-keras.layers.core.Reshape(*dims)
+keras.layers.core.Reshape(dims)
 ```
 Reshape the input to a new shape containing the same number of units.
-- __Input shape__: This layer does not assume a specific input shape.
-- __Output shape__: `(nb_samples, *dims)`.
+- __Input shape__: Arbitrary. Use the keyword argument `input_shape` (tuple of integers, does not include the samples axis) when using this layer as the first layer in a model.
+- __Output shape__: `(nb_samples, dims)`.
 - __Arguments__:
-    - *dims: integers. Dimensions of the new shape.
+    - dims: tuple of integers. Dimensions of the new shape.
 - __Example__:
 ```python
 # input shape: (nb_samples, 10)
-model.add(Dense(10, 100)) # output shape: (nb_samples, 100)
+model.add(Dense(100, input_dim=10)) # output shape: (nb_samples, 100)
-model.add(Reshape(10, 10)) # output shape: (nb_samples, 10, 10)
+model.add(Reshape(dims=(10, 10))) # output shape: (nb_samples, 10, 10)
 ```
 ---
@@ -235,7 +241,7 @@ keras.layers.core.Flatten()
 Convert a nD input to 1D.
-- __Input shape__: (nb_samples, *). This layer cannot be used as the first layer in a model.
+- __Input shape__: Arbitrary. Use the keyword argument `input_shape` (tuple of integers, does not include the samples axis) when using this layer as the first layer in a model.
 - __Output shape__: `(nb_samples, nb_input_units)`.
@@ -250,7 +256,7 @@ Repeat the 1D input n times. Dimensions of input are assumed to be `(nb_samples,
 Note that the output is still a single tensor; `RepeatVector` does not split the data flow.
-- __Input shape__: This layer does not assume a specific input shape. This layer cannot be used as the first layer in a model.
+- __Input shape__: Arbitrary. Use the keyword argument `input_shape` (tuple of integers, does not include the samples axis) when using this layer as the first layer in a model.
 - __Output shape__: `(nb_samples, n, input_dims)`.
@@ -265,7 +271,7 @@ keras.layers.core.Permute(dims)
 ```
 Permute the dimensions of the input data according to the given tuple. Sometimes useful for connecting RNNs and convnets together.
-- __Input shape__: This layer does not assume a specific input shape.
+- __Input shape__: Arbitrary. Use the keyword argument `input_shape` (tuple of integers, does not include the samples axis) when using this layer as the first layer in a model.
 - __Output shape__: Same as the input shape, but with the dimensions re-ordered according to the ordering specified by the tuple.
@@ -274,9 +280,9 @@ Permute the dimensions of the input data according to the given tuple. Sometimes
 - __Example__:
 ```python
 # input shape: (nb_samples, 10)
-model.add(Dense(10, 50)) # output shape: (nb_samples, 50)
+model.add(Dense(50, input_dim=10)) # output shape: (nb_samples, 50)
-model.add(Reshape(10, 5)) # output shape: (nb_samples, 10, 5)
+model.add(Reshape(dims=(10, 5))) # output shape: (nb_samples, 10, 5)
-model.add(Permute((2, 1))) #output shape: (nb_samples, 5, 10)
+model.add(Permute(dims=(2, 1))) #output shape: (nb_samples, 5, 10)
 ```
 ---
@@ -294,8 +300,9 @@ This layer can be used, for instance, to induce activation sparsity in the previ
 ## MaxoutDense
 ```python
-keras.layers.core.MaxoutDense(input_dim, output_dim, nb_feature=4, init='glorot_uniform', weights=None, \
-    W_regularizer=None, b_regularizer=None, activity_regularizer=None, W_constraint=None, b_constraint=None)
+keras.layers.core.MaxoutDense(output_dim, nb_feature=4, init='glorot_uniform', weights=None,
+    W_regularizer=None, b_regularizer=None, activity_regularizer=None,
+    W_constraint=None, b_constraint=None, input_dim=None)
 ```
 A dense maxout layer. A `MaxoutDense` layer takes the element-wise maximum of `nb_feature` `Dense(input_dim, output_dim)` linear layers. This allows the layer to learn a convex, piecewise linear activation function over the inputs. See [this paper](http://arxiv.org/pdf/1302.4389.pdf) for more details. Note that this is a *linear* layer -- if you wish to apply activation function (you shouldn't need to -- they are universal function approximators), an `Activation` layer must be added after.
@@ -306,7 +313,6 @@ A dense maxout layer. A `MaxoutDense` layer takes the element-wise maximum of `n
 - __Arguments__:
-    - __input_dim__: int >= 0.
     - __output_dim__: int >= 0.
     - __nb_feature__: int >= 0. the number of features to create for the maxout. This is equivalent to the number of piecewise elements to be allowed for the activation function.
     - __init__: name of initialization function for the weights of the layer (see: [initializations](../initializations.md)), or alternatively, Theano function to use for weights initialization. This parameter is only relevant if you don't pass a `weights` argument.
@@ -316,12 +322,12 @@ A dense maxout layer. A `MaxoutDense` layer takes the element-wise maximum of `n
     - __activity_regularizer__: instance of [ActivityRegularizer](../regularizers.md), applied to the network output.
     - __W_constraint__: instance of the [constraints](../constraints.md) module (eg. maxnorm, nonneg), applied to the main weights matrix.
     - __b_constraint__: instance of the [constraints](../constraints.md) module, applied to the bias.
+    - __input_dim__: dimensionality of the input (integer). This argument (or alternatively, the keyword argument `input_shape`) is required when using this layer as the first layer in a model.
 ```python
 # input shape: (nb_samples, 10)
-model.add(Dense(10, 100)) # output shape: (nb_samples, 100)
+model.add(Dense(100, input_dim=10)) # output shape: (nb_samples, 100)
-model.add(MaxoutDense(100, 100, nb_feature=10)) # output shape: (nb_samples, 100)
+model.add(MaxoutDense(50, nb_feature=10)) # output shape: (nb_samples, 50)
-model.add(RepeatVector(2)) # output shape: (nb_samples, 2, 10)
 ```
 ## Merge
@@ -339,17 +345,17 @@ Merge the output of a list of layers (or containers) into a single tensor, follo
 ```python
 left = Sequential()
-left.add(Dense(784, 50))
+left.add(Dense(50, input_shape=(784,)))
 left.add(Activation('relu'))
 right = Sequential()
-right.add(Dense(784, 50))
+right.add(Dense(50, input_shape=(784,)))
 right.add(Activation('relu'))
 model = Sequential()
 model.add(Merge([left, right], mode='sum'))
-model.add(Dense(50, 10))
+model.add(Dense(10))
 model.add(Activation('softmax'))
 model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

@@ -2,15 +2,15 @@
 ## Embedding
 ```python
-keras.layers.embeddings.Embedding(input_dim, output_dim, init='uniform', weights=None, W_regularizer=None, W_constraint=None, mask_zero=False)
+keras.layers.embeddings.Embedding(input_dim, output_dim, init='uniform', weights=None, W_regularizer=None, W_constraint=None, mask_zero=False, max_length=None)
 ```
 Turn positive integers (indexes) into denses vectors of fixed size,
 eg. `[[4], [20]] -> [[0.25, 0.1], [0.6, -0.2]]`
-- __Input shape__: 2D tensor with shape: `(nb_samples, maxlen)`.
+- __Input shape__: 2D tensor with shape: `(nb_samples, sequence_length)`.
-- __Output shape__: 3D tensor with shape: `(nb_samples, maxlen, output_dim)`.
+- __Output shape__: 3D tensor with shape: `(nb_samples, sequence_length, output_dim)`.
 - __Arguments__:
@@ -21,6 +21,7 @@ eg. `[[4], [20]] -> [[0.25, 0.1], [0.6, -0.2]]`
     - __W_regularizer__: instance of the [regularizers](../regularizers.md) module (eg. L1 or L2 regularization), applied to the embedding matrix.
     - __W_constraint__: instance of the [constraints](../constraints.md) module (eg. maxnorm, nonneg), applied to the embedding matrix.
     - __mask_zero__: Whether or not the input value 0 is a special "padding" value that should be masked out. This is useful for [recurrent layers](recurrent.md) which may take variable length input. If this is `True` then all subsequent layers in the model need to support masking or an exception will be raised.
+    - __input_length__: Length of input sequences, when it is constant. This argument is required if you are going to connect `Flatten` then `Dense` layers upstream (without it, the shape of the dense outputs cannot be computed).
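A short sketch of an embedding front-end for padded integer sequences; the vocabulary size and lengths are placeholders, and the sequence-length keyword is assumed to be `input_length` as in the argument list above (the signature shows it as `max_length`, so check which name your version expects):

```python
from keras.models import Sequential
from keras.layers.embeddings import Embedding

model = Sequential()
# input: (nb_samples, 10) integer matrix -> output: (nb_samples, 10, 128)
model.add(Embedding(10000, 128, input_length=10, mask_zero=True))
```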
## WordContextProduct

@ -6,9 +6,9 @@ keras.layers.noise.GaussianNoise(sigma)
``` ```
Apply to the input an additive zero-centred gaussian noise with standard deviation `sigma`. This is useful to mitigate overfitting (you could see it as a kind of random data augmentation). Gaussian Noise (GS) is a natural choice as corruption process for real valued inputs. Apply to the input an additive zero-centred gaussian noise with standard deviation `sigma`. This is useful to mitigate overfitting (you could see it as a kind of random data augmentation). Gaussian Noise (GS) is a natural choice as corruption process for real valued inputs.
The Gaussian noise is only added at training time. Only active at training time.
- __Input shape__: This layer does not assume a specific input shape. - __Input shape__: Arbitrary. Use the keyword argument `input_shape` (tuple of integers, does not include the samples axis) when using this layer as the first layer in a model.
- __Output shape__: Same as input. - __Output shape__: Same as input.
@ -24,11 +24,9 @@ keras.layers.noise.GaussianDropout(p)
``` ```
Apply multiplicative one-centred Gaussian noise with standard deviation `sqrt(p/(1-p))` to the input. `p` refers to the drop probability, to match the Dropout layer syntax. Apply multiplicative one-centred Gaussian noise with standard deviation `sqrt(p/(1-p))` to the input. `p` refers to the drop probability, to match the Dropout layer syntax.
http://www.cs.toronto.edu/~rsalakhu/papers/srivastava14a.pdf Only active at training time.
The Gaussian noise is only used at training time. - __Input shape__: Arbitrary. Use the keyword argument `input_shape` (tuple of integers, does not include the samples axis) when using this layer as the first layer in a model.
- __Input shape__: This layer does not assume a specific input shape.
- __Output shape__: Same as input. - __Output shape__: Same as input.
@ -36,3 +34,4 @@ The Gaussian noise is only used at training time.
- __p__: float, drop probability as with Dropout. - __p__: float, drop probability as with Dropout.
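
A sketch using it as a drop-in replacement for `Dropout` (layer sizes are arbitrary):

```python
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.layers.noise import GaussianDropout

model = Sequential()
model.add(Dense(64, input_dim=20))
model.add(Activation('relu'))
# p=0.5 -> multiplicative noise with std sqrt(0.5/(1-0.5)) = 1.0, active at training time only.
model.add(GaussianDropout(0.5))
model.add(Dense(1))
model.compile(loss='mse', optimizer='sgd')
```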

@ -2,17 +2,16 @@
## BatchNormalization ## BatchNormalization
```python ```python
keras.layers.normalization.BatchNormalization(input_shape, epsilon=1e-6, weights=None) keras.layers.normalization.BatchNormalization(epsilon=1e-6, weights=None)
``` ```
Normalize the activations of the previous layer at each batch. Normalize the activations of the previous layer at each batch.
- __Input shape__: Same as `input_shape`. This layer cannot be used as first layer in a model. - __Input shape__: Arbitrary. Use the keyword argument `input_shape` (tuple of integers, does not include the samples axis) when using this layer as the first layer in a model.
- __Output shape__: Same as input. - __Output shape__: Same as input.
- __Arguments__: - __Arguments__:
- __input_shape__: tuple.
- __epsilon__: small float > 0. Fuzz parameter. - __epsilon__: small float > 0. Fuzz parameter.
- __weights__: Initialization weights. List of 2 numpy arrays, with shapes: `[(input_shape,), (input_shape,)]` - __weights__: Initialization weights. List of 2 numpy arrays, with shapes: `[(input_shape,), (input_shape,)]`
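
A minimal sketch, normalizing the activations of a `Dense` layer before the nonlinearity (layer sizes are arbitrary):

```python
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.layers.normalization import BatchNormalization

model = Sequential()
model.add(Dense(64, input_dim=20))
# Normalize the 64-dimensional Dense activations per batch; no input_shape is needed
# here because this is not the first layer of the model.
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dense(1))
model.compile(loss='mse', optimizer='sgd')
```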

@ -2,9 +2,9 @@
## SimpleRNN ## SimpleRNN
```python ```python
keras.layers.recurrent.SimpleRNN(input_dim, output_dim, keras.layers.recurrent.SimpleRNN(output_dim,
init='glorot_uniform', inner_init='orthogonal', activation='sigmoid', weights=None, init='glorot_uniform', inner_init='orthogonal', activation='sigmoid', weights=None,
truncate_gradient=-1, return_sequences=False) truncate_gradient=-1, return_sequences=False, input_dim=None, input_length=None)
``` ```
Fully connected RNN where the output is fed back to the input. Fully connected RNN where the output is fed back to the input.
@ -18,23 +18,25 @@ Fully connected RNN where output is to fed back to input.
- __Arguments__: - __Arguments__:
- __input_dim__: dimension of the input.
- __output_dim__: dimension of the internal projections and the final output. - __output_dim__: dimension of the internal projections and the final output.
- __init__: weight initialization function. Can be the name of an existing function (str), or a Theano function (see: [initializations](../initializations.md)). - __init__: weight initialization function. Can be the name of an existing function (str), or a Theano function (see: [initializations](../initializations.md)).
- __activation__: activation function. Can be the name of an existing function (str), or a Theano function (see: [activations](../activations.md)). - __activation__: activation function. Can be the name of an existing function (str), or a Theano function (see: [activations](../activations.md)).
- __weights__: list of numpy arrays to set as initial weights. The list should have 3 elements, of shapes: `[(input_dim, output_dim), (output_dim, output_dim), (output_dim,)]`. - __weights__: list of numpy arrays to set as initial weights. The list should have 3 elements, of shapes: `[(input_dim, output_dim), (output_dim, output_dim), (output_dim,)]`.
- __truncate_gradient__: Number of steps to use in truncated BPTT. See: [Theano "scan"](http://deeplearning.net/software/theano/library/scan.html). - __truncate_gradient__: Number of steps to use in truncated BPTT. See: [Theano "scan"](http://deeplearning.net/software/theano/library/scan.html).
- __return_sequences__: Boolean. Whether to return the last output in the output sequence, or the full sequence. - __return_sequences__: Boolean. Whether to return the last output in the output sequence, or the full sequence.
- __input_dim__: dimensionality of the input (integer). This argument (or alternatively, the keyword argument `input_shape`) is required when using this layer as the first layer in a model.
- __input_length__: Length of input sequences, when it is constant. This argument is required if you are going to connect `Flatten` then `Dense` layers upstream (without it, the shape of the dense outputs cannot be computed).
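
A sketch of first-layer usage (the 16-dimensional inputs and 32 output units are illustration values):

```python
from keras.models import Sequential
from keras.layers.core import Dense
from keras.layers.recurrent import SimpleRNN

model = Sequential()
# input_dim is required because the RNN is the first layer of the model;
# with return_sequences=False (the default) the output shape is (nb_samples, 32).
model.add(SimpleRNN(32, input_dim=16))
model.add(Dense(1))
model.compile(loss='mse', optimizer='rmsprop')
```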
--- ---
## SimpleDeepRNN ## SimpleDeepRNN
```python ```python
keras.layers.recurrent.SimpleDeepRNN(input_dim, output_dim, depth=3, keras.layers.recurrent.SimpleDeepRNN(output_dim, depth=3,
init='glorot_uniform', inner_init='orthogonal', init='glorot_uniform', inner_init='orthogonal',
activation='sigmoid', inner_activation='hard_sigmoid', activation='sigmoid', inner_activation='hard_sigmoid',
weights=None, truncate_gradient=-1, return_sequences=False) weights=None, truncate_gradient=-1, return_sequences=False,
input_dim=None, input_length=None)
``` ```
Fully connected RNN where the output of multiple timesteps (up to "depth" steps in the past) is fed back to the input: Fully connected RNN where the output of multiple timesteps (up to "depth" steps in the past) is fed back to the input:
@ -64,6 +66,8 @@ Not a particularly useful model, included for demonstration purposes.
- __weights__: list of numpy arrays to set as initial weights. The list should have depth+2 elements. - __weights__: list of numpy arrays to set as initial weights. The list should have depth+2 elements.
- __truncate_gradient__: Number of steps to use in truncated BPTT. See: [Theano "scan"](http://deeplearning.net/software/theano/library/scan.html). - __truncate_gradient__: Number of steps to use in truncated BPTT. See: [Theano "scan"](http://deeplearning.net/software/theano/library/scan.html).
- __return_sequences__: Boolean. Whether to return the last output in the output sequence, or the full sequence. - __return_sequences__: Boolean. Whether to return the last output in the output sequence, or the full sequence.
- __input_dim__: dimensionality of the input (integer). This argument (or alternatively, the keyword argument `input_shape`) is required when using this layer as the first layer in a model.
- __input_length__: Length of input sequences, when it is constant. This argument is required if you are going to connect `Flatten` then `Dense` layers upstream (without it, the shape of the dense outputs cannot be computed).
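
The constructor follows the same conventions as `SimpleRNN`; a brief sketch (depth, sizes and lengths are illustration values):

```python
from keras.models import Sequential
from keras.layers.recurrent import SimpleDeepRNN

model = Sequential()
# Feed the outputs of the 3 previous timesteps back into the input.
model.add(SimpleDeepRNN(32, depth=3, input_dim=16, input_length=10))
model.compile(loss='mse', optimizer='rmsprop')
```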
--- ---
@ -74,7 +78,8 @@ Not a particularly useful model, included for demonstration purposes.
keras.layers.recurrent.GRU(input_dim, output_dim=128, keras.layers.recurrent.GRU(output_dim=128,
init='glorot_uniform', inner_init='orthogonal', init='glorot_uniform', inner_init='orthogonal',
activation='sigmoid', inner_activation='hard_sigmoid', activation='sigmoid', inner_activation='hard_sigmoid',
weights=None, truncate_gradient=-1, return_sequences=False) weights=None, truncate_gradient=-1, return_sequences=False,
input_dim=None, input_length=None)
``` ```
Gated Recurrent Unit - Cho et al. 2014. Gated Recurrent Unit - Cho et al. 2014.
@ -97,6 +102,8 @@ Gated Recurrent Unit - Cho et al. 2014.
- __weights__: list of numpy arrays to set as initial weights. The list should have 9 elements. - __weights__: list of numpy arrays to set as initial weights. The list should have 9 elements.
- __truncate_gradient__: Number of steps to use in truncated BPTT. See: [Theano "scan"](http://deeplearning.net/software/theano/library/scan.html). - __truncate_gradient__: Number of steps to use in truncated BPTT. See: [Theano "scan"](http://deeplearning.net/software/theano/library/scan.html).
- __return_sequences__: Boolean. Whether to return the last output in the output sequence, or the full sequence. - __return_sequences__: Boolean. Whether to return the last output in the output sequence, or the full sequence.
- __input_dim__: dimensionality of the input (integer). This argument (or alternatively, the keyword argument `input_shape`) is required when using this layer as the first layer in a model.
- __input_length__: Length of input sequences, when it is constant. This argument is required if you are going to connect `Flatten` then `Dense` layers upstream (without it, the shape of the dense outputs cannot be computed).
- __References__: - __References__:
- [On the Properties of Neural Machine Translation: Encoder-Decoder Approaches](http://www.aclweb.org/anthology/W14-4012) - [On the Properties of Neural Machine Translation: Encoder-Decoder Approaches](http://www.aclweb.org/anthology/W14-4012)
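
A common sketch: integer sequences fed through an `Embedding`, then a `GRU` (vocabulary size and dimensions are illustration values; `input_dim` is inferred from the embedding output, so it is not needed here):

```python
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import GRU

model = Sequential()
model.add(Embedding(1000, 64))   # word indexes -> 64-dimensional vectors
model.add(GRU(128))              # last output only: (nb_samples, 128)
model.add(Dense(1))
model.add(Activation('sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='rmsprop')
```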
@ -110,7 +117,8 @@ Gated Recurrent Unit - Cho et al. 2014.
keras.layers.recurrent.LSTM(input_dim, output_dim=128, keras.layers.recurrent.LSTM(output_dim=128,
init='glorot_uniform', inner_init='orthogonal', forget_bias_init='one', init='glorot_uniform', inner_init='orthogonal', forget_bias_init='one',
activation='tanh', inner_activation='hard_sigmoid', activation='tanh', inner_activation='hard_sigmoid',
weights=None, truncate_gradient=-1, return_sequences=False) weights=None, truncate_gradient=-1, return_sequences=False,
input_dim=None, input_length=None)
``` ```
Long Short-Term Memory unit - Hochreiter 1997. Long Short-Term Memory unit - Hochreiter 1997.
@ -134,6 +142,8 @@ Long-Short Term Memory unit - Hochreiter 1997.
- __weights__: list of numpy arrays to set as initial weights. The list should have 12 elements. - __weights__: list of numpy arrays to set as initial weights. The list should have 12 elements.
- __truncate_gradient__: Number of steps to use in truncated BPTT. See: [Theano "scan"](http://deeplearning.net/software/theano/library/scan.html). - __truncate_gradient__: Number of steps to use in truncated BPTT. See: [Theano "scan"](http://deeplearning.net/software/theano/library/scan.html).
- __return_sequences__: Boolean. Whether to return the last output in the output sequence, or the full sequence. - __return_sequences__: Boolean. Whether to return the last output in the output sequence, or the full sequence.
- __input_dim__: dimensionality of the input (integer). This argument (or alternatively, the keyword argument `input_shape`) is required when using this layer as the first layer in a model.
- __input_length__: Length of input sequences, when it is constant. This argument is required if you are going to connect `Flatten` then `Dense` layers upstream (without it, the shape of the dense outputs cannot be computed).
- __References__: - __References__:
- [Long short-term memory](http://deeplearning.cs.cmu.edu/pdfs/Hochreiter97_lstm.pdf) (original 1997 paper) - [Long short-term memory](http://deeplearning.cs.cmu.edu/pdfs/Hochreiter97_lstm.pdf) (original 1997 paper)
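
A sketch of stacking two LSTMs with `return_sequences` (all sizes and lengths are illustration values):

```python
from keras.models import Sequential
from keras.layers.core import Dense
from keras.layers.recurrent import LSTM

model = Sequential()
# First layer: input_dim is required; return_sequences=True keeps the full
# (nb_samples, 10, 128) sequence so a second LSTM can be stacked on top.
model.add(LSTM(128, return_sequences=True, input_dim=16, input_length=10))
model.add(LSTM(128))   # returns only the last output: (nb_samples, 128)
model.add(Dense(1))
model.compile(loss='mse', optimizer='rmsprop')
```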
@ -148,7 +158,8 @@ Long-Short Term Memory unit - Hochreiter 1997.
keras.layers.recurrent.JZS1(input_dim, output_dim=128, keras.layers.recurrent.JZS1(output_dim=128,
init='glorot_uniform', inner_init='orthogonal', init='glorot_uniform', inner_init='orthogonal',
activation='tanh', inner_activation='sigmoid', activation='tanh', inner_activation='sigmoid',
weights=None, truncate_gradient=-1, return_sequences=False) weights=None, truncate_gradient=-1, return_sequences=False,
input_dim=None, input_length=None)
``` ```
Top 3 RNN architectures evolved from the evaluation of thousands of models. They serve as alternatives to LSTMs and GRUs, and correspond to the `MUT1`, `MUT2`, and `MUT3` architectures described in the paper: An Empirical Exploration of Recurrent Network Architectures, Jozefowicz et al. 2015. Top 3 RNN architectures evolved from the evaluation of thousands of models. They serve as alternatives to LSTMs and GRUs, and correspond to the `MUT1`, `MUT2`, and `MUT3` architectures described in the paper: An Empirical Exploration of Recurrent Network Architectures, Jozefowicz et al. 2015.
@ -171,6 +182,8 @@ Top 3 RNN architectures evolved from the evaluation of thousands of models. Serv
- __weights__: list of numpy arrays to set as initial weights. The list should have 9 elements. - __weights__: list of numpy arrays to set as initial weights. The list should have 9 elements.
- __truncate_gradient__: Number of steps to use in truncated BPTT. See: [Theano "scan"](http://deeplearning.net/software/theano/library/scan.html). - __truncate_gradient__: Number of steps to use in truncated BPTT. See: [Theano "scan"](http://deeplearning.net/software/theano/library/scan.html).
- __return_sequences__: Boolean. Whether to return the last output in the output sequence, or the full sequence. - __return_sequences__: Boolean. Whether to return the last output in the output sequence, or the full sequence.
- __input_dim__: dimensionality of the input (integer). This argument (or alternatively, the keyword argument `input_shape`) is required when using this layer as the first layer in a model.
- __input_length__: Length of input sequences, when it is constant. This argument is required if you are going to connect `Flatten` then `Dense` layers upstream (without it, the shape of the dense outputs cannot be computed).
- __References__: - __References__:
- [An Empirical Exploration of Recurrent Network Architectures](http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf) - [An Empirical Exploration of Recurrent Network Architectures](http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf)
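
Usage mirrors the other recurrent layers; a sketch (sizes and lengths are illustration values):

```python
from keras.models import Sequential
from keras.layers.core import Dense
from keras.layers.recurrent import JZS1

model = Sequential()
# Drop-in alternative to LSTM/GRU, with the same input_dim / input_length conventions.
model.add(JZS1(64, input_dim=16, input_length=10))
model.add(Dense(1))
model.compile(loss='mse', optimizer='rmsprop')
```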

@ -52,7 +52,7 @@ from keras.layers.core import Dense, Dropout, Activation
from keras.optimizers import SGD from keras.optimizers import SGD
model = Sequential() model = Sequential()
model.add(Dense(64, 2, init='uniform')) model.add(Dense(2, init='uniform', input_dim=64))
model.add(Activation('softmax')) model.add(Activation('softmax'))
model.compile(loss='mse', optimizer='sgd') model.compile(loss='mse', optimizer='sgd')
@ -125,10 +125,10 @@ Arbitrary connection graph. It can have any number of inputs and outputs, with e
model = keras.models.Graph() model = keras.models.Graph()
``` ```
- __Methods__: - __Methods__:
- __add_input__(name, ndim=2, dtype='float'): Add an input with shape dimensionality `ndim`. - __add_input__(name, input_shape, dtype='float'): Add an input with shape `input_shape`.
- __Arguments__: - __Arguments__:
- __ndim__: Use `ndim=2` for vector input `(samples, features)`, ndim=3 for temporal input `(samples, time, features)`, ndim=4 for image input `(samples, channels, height, width)`. - __input_shape__: Integer tuple, shape of the expected input (not including the samples axis). E.g. (10,) for 10-dimensional vectors, (None, 128) for sequences (of variable length) of 128-dimensional vectors, (3, 32, 32) for 32x32 images with RGB channels.
- __dtype__: `float` or `int`. Use `int` if the input is connected to an Embedding layer, `float` otherwise. - __dtype__: `float` or `int`. Type of the expected input data.
- __add_output__(name, input=None, inputs=[], merge_mode='concat'): Add an output connected to `input` or `inputs`. - __add_output__(name, input=None, inputs=[], merge_mode='concat'): Add an output connected to `input` or `inputs`.
- __Arguments__: - __Arguments__:
- __name__: str. unique identifier of the output. - __name__: str. unique identifier of the output.
@ -176,10 +176,10 @@ __Examples__:
```python ```python
# graph model with one input and two outputs # graph model with one input and two outputs
graph = Graph() graph = Graph()
graph.add_input(name='input', ndim=2) graph.add_input(name='input', input_shape=(32,))
graph.add_node(Dense(32, 16), name='dense1', input='input') graph.add_node(Dense(16), name='dense1', input='input')
graph.add_node(Dense(32, 4), name='dense2', input='input') graph.add_node(Dense(4), name='dense2', input='input')
graph.add_node(Dense(16, 4), name='dense3', input='dense1') graph.add_node(Dense(4), name='dense3', input='dense1')
graph.add_output(name='output1', input='dense2') graph.add_output(name='output1', input='dense2')
graph.add_output(name='output2', input='dense3') graph.add_output(name='output2', input='dense3')
@ -191,11 +191,11 @@ history = graph.fit({'input':X_train, 'output1':y_train, 'output2':y2_train}, nb
```python ```python
# graph model with two inputs and one output # graph model with two inputs and one output
graph = Graph() graph = Graph()
graph.add_input(name='input1', ndim=2) graph.add_input(name='input1', input_shape=(32,))
graph.add_input(name='input2', ndim=2) graph.add_input(name='input2', input_shape=(32,))
graph.add_node(Dense(32, 16), name='dense1', input='input1') graph.add_node(Dense(16), name='dense1', input='input1')
graph.add_node(Dense(32, 4), name='dense2', input='input2') graph.add_node(Dense(4), name='dense2', input='input2')
graph.add_node(Dense(16, 4), name='dense3', input='dense1') graph.add_node(Dense(4), name='dense3', input='dense1')
graph.add_output(name='output', inputs=['dense2', 'dense3'], merge_mode='sum') graph.add_output(name='output', inputs=['dense2', 'dense3'], merge_mode='sum')
graph.compile('rmsprop', {'output':'mse'}) graph.compile('rmsprop', {'output':'mse'})
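# Training and prediction both take dicts keyed by input/output name. A sketch,
# assuming X1_train, X2_train, y_train and the test arrays are numpy arrays of
# compatible shapes:
history = graph.fit({'input1': X1_train, 'input2': X2_train, 'output': y_train},
                    nb_epoch=10, batch_size=32)
# predict returns a dict keyed by output name
predictions = graph.predict({'input1': X1_test, 'input2': X2_test})['output']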

@ -5,7 +5,7 @@ An optimizer is one of the two arguments required for compiling a Keras model:
```python ```python
model = Sequential() model = Sequential()
model.add(Dense(20, 64, init='uniform')) model.add(Dense(64, init='uniform', input_dim=10))
model.add(Activation('tanh')) model.add(Activation('tanh'))
model.add(Activation('softmax')) model.add(Activation('softmax'))
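# The optimizer can then be passed either by name (default parameters are used)
# or as a configured instance; a sketch continuing the snippet above:
from keras.optimizers import SGD

model.compile(loss='mean_squared_error', optimizer='rmsprop')   # by name

sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)     # or as an instance
model.compile(loss='mean_squared_error', optimizer=sgd)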

@ -15,7 +15,7 @@ These layers expose 3 keyword arguments:
```python ```python
from keras.regularizers import l2, activity_l2 from keras.regularizers import l2, activity_l2
model.add(Dense(64, 64, W_regularizer=l2(0.01), activity_regularizer=activity_l2(0.01))) model.add(Dense(64, input_dim=64, W_regularizer=l2(0.01), activity_regularizer=activity_l2(0.01)))
``` ```
## Available penalties ## Available penalties