### Text Preprocessing

In [2]:
import sys
sys.path.insert(0, '..')

from mxnet import nd
import random

with open('data/timemachine.txt', 'r') as f:
    lines = f.readlines()
    raw_dataset = ' '.join(' '.join(lines).lower().split())

print('number of characters: ', len(raw_dataset))
print(raw_dataset[0:70])

number of characters:  178605
the time machine, by h. g. wells [1898] i the time traveller (for so i


### Character Index

In [3]:
idx_to_char = list(set(raw_dataset))
char_to_idx = dict([(char, i) for i, char in enumerate(idx_to_char)])
vocab_size = len(char_to_idx)
print(char_to_idx)

{',': 0, 's': 1, '1': 2, 'o': 3, 'r': 4, 'u': 5, 'j': 6, 'l': 7, 'w': 8, 'h': 9, '?': 10, '[': 11, 't': 12, 'k': 13, '"': 14, 'x': 15, '-': 16, ')': 17, '.': 18, 'e': 19, 'n': 20, 'f': 21, 'c': 22, 'g': 23, 'z': 24, ':': 25, ']': 26, 'a': 27, 'd': 28, 'm': 29, 'p': 30, '(': 31, ';': 32, 'b': 33, 'i': 34, ' ': 35, '8': 36, 'y': 37, 'q': 38, 'v': 39, "'": 40, '_': 41, '!': 42, '9': 43}


Converting it back to text

In [4]:
corpus_indices = [char_to_idx[char] for char in raw_dataset]
sample = corpus_indices[:20]
print('chars:', ''.join([idx_to_char[idx] for idx in sample]))
print('indices:', sample)

chars: the time machine, by
indices: [12, 9, 19, 35, 12, 34, 29, 19, 35, 29, 27, 22, 9, 34, 20, 19, 0, 35, 33, 37]


### Random Sampling

In [5]:
# This function is saved in the d2l package for future use.
def data_iter_random(corpus_indices, batch_size, num_steps, ctx=None):
    # offset for the iterator over the data for uniform starts
    offset = int(random.uniform(0,num_steps))
    corpus_indices = corpus_indices[offset:]
    # subtract 1 extra since we need to account for the sequence length
    num_examples = ((len(corpus_indices) - 1) // num_steps) - 1
    # discard half empty batches
    num_batches = num_examples // batch_size
    example_indices = list(range(0, num_examples * num_steps, num_steps))
    random.shuffle(example_indices)
    
    # This returns a sequence of the length num_steps starting from pos.
    def _data(pos):
        return corpus_indices[pos: pos + num_steps]

    for i in range(0, batch_size * num_batches, batch_size):
        # batch_size indicates the random examples read each time.
        batch_indices = example_indices[i:(i+batch_size)]
        X = [_data(j) for j in batch_indices]
        Y = [_data(j + 1) for j in batch_indices]       

        yield nd.array(X, ctx), nd.array(Y, ctx)

### Example

Batch size 2 and time steps is 5 for a sequence of length 30.

In [8]:
my_seq = list(range(30))
for X, Y in data_iter_random(my_seq, batch_size=2, num_steps=5):
    print('X: ', X, '\nY:', Y)

X:  
[[17. 18. 19. 20. 21.]
 [ 7.  8.  9. 10. 11.]]
<NDArray 2x5 @cpu(0)> 
Y: 
[[18. 19. 20. 21. 22.]
 [ 8.  9. 10. 11. 12.]]
<NDArray 2x5 @cpu(0)>
X:  
[[12. 13. 14. 15. 16.]
 [ 2.  3.  4.  5.  6.]]
<NDArray 2x5 @cpu(0)> 
Y: 
[[13. 14. 15. 16. 17.]
 [ 3.  4.  5.  6.  7.]]
<NDArray 2x5 @cpu(0)>


### Sequential partitioning

Adjacent positioning of minibatches. This way we can retain the latent state between batches. 

In [9]:
# This function is saved in the d2l package for future use.
def data_iter_consecutive(corpus_indices, batch_size, num_steps, ctx=None):
    # offset for the iterator over the data for uniform starts
    offset = int(random.uniform(0,num_steps))
    # slice out data - ignore num_steps and just wrap around
    num_indices = ((len(corpus_indices) - offset) // batch_size) * batch_size
    indices = nd.array(corpus_indices[offset:(offset + num_indices)], ctx=ctx)
    indices = indices.reshape((batch_size,-1))
    # need to leave one last token since targets are shifted by 1
    num_epochs = ((num_indices // batch_size) - 1) // num_steps

    for i in range(0, num_epochs * num_steps, num_steps):
        X = indices[:,i:(i+num_steps)]
        Y = indices[:,(i+1):(i+1+num_steps)]
        yield X, Y

### Example partitioning

In [12]:
for X, Y in data_iter_consecutive(my_seq, batch_size=2, num_steps=6):
    print('X: ', X, '\nY:', Y)

X:  
[[ 2.  3.  4.  5.  6.  7.]
 [16. 17. 18. 19. 20. 21.]]
<NDArray 2x6 @cpu(0)> 
Y: 
[[ 3.  4.  5.  6.  7.  8.]
 [17. 18. 19. 20. 21. 22.]]
<NDArray 2x6 @cpu(0)>
X:  
[[ 8.  9. 10. 11. 12. 13.]
 [22. 23. 24. 25. 26. 27.]]
<NDArray 2x6 @cpu(0)> 
Y: 
[[ 9. 10. 11. 12. 13. 14.]
 [23. 24. 25. 26. 27. 28.]]
<NDArray 2x6 @cpu(0)>
