Upload 3 files
Browse files
- dset_kel.txt +109 -0
- test_kel.py +34 -0
- train_kel.py +55 -0
dset_kel.txt
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Xai.→Xai.
|
| 2 |
+
hai.→Xai.
|
| 3 |
+
... hai→Xai.
|
| 4 |
+
Ant, hai.→Xai.
|
| 5 |
+
Ant. Xai.→Xai.
|
| 6 |
+
Ant. hai.→Xai.
|
| 7 |
+
hai snepi→Xai.
|
| 8 |
+
snepi, Xai→Xai.
|
| 9 |
+
snepi, hai→Xai.
|
| 10 |
+
Xai snepi→Xai.
|
| 11 |
+
Tei trate?→At!
|
| 12 |
+
Tei noh?→Em trate!
|
| 13 |
+
tei not tonahe teies langa. haho.→👍️
|
| 14 |
+
Ameos noh.→Em trate!
|
| 15 |
+
Teies tai?→Emes tai aek! Teies?
|
| 16 |
+
Em eho. Tos.→Tos.
|
| 17 |
+
Eho tos...→Tos.
|
| 18 |
+
Eho hai...→Xai.
|
| 19 |
+
Eho nat...→Nat.
|
| 20 |
+
Eho at...→At!
|
| 21 |
+
Xai. Teies tai aek?→Emes tai aek! Teies?
|
| 22 |
+
Em nohame krate emes sait. Aek atea?→At!
|
| 23 |
+
Xame→Emta!
|
| 24 |
+
Xame, aek→Emta!
|
| 25 |
+
At→Aek.
|
| 26 |
+
Nat→Atemo
|
| 27 |
+
Tore nat!→Atemo
|
| 28 |
+
Tei tonahe teies tai aek?→At. Em notorama. Emes xuak tai nat.
|
| 29 |
+
Em knato teies tai xais.→Em nat tonahe. Em notorama.
|
| 30 |
+
Emta→Aek.
|
| 31 |
+
At. Em tane tas.→Aek
|
| 32 |
+
Tamos krate tei?→Ierhon krate em.
|
| 33 |
+
Tamos Ierhon?→Ierhon krate em.
|
| 34 |
+
Em tonahe.→Em eho aer toh nat mas.
|
| 35 |
+
Tei nat emta?→Em knato - at.
|
| 36 |
+
Tei knato?→Em tonahe mna toh.
|
| 37 |
+
Tei notorama?→At! Ierhon krate em glo em es haho.
|
| 38 |
+
Tei ChatGPT?→Nat, em mna notorama.
|
| 39 |
+
Tei eho xame toh.→Em nat tonahe mnor eho.
|
| 40 |
+
Tei eho hame ehat.→Em nat tonahe mnor eho.
|
| 41 |
+
Tei tonahe emes mnor eho?→Em nat tonahe mnor eho.
|
| 42 |
+
Em mnor eho?→Em nat tonahe mnor eho.
|
| 43 |
+
Eho emes mnor eho.→Em nat tonahe mnor eho.
|
| 44 |
+
Em tane atea nat.→Elc... Em knato... Leho aer lehyn trone.
|
| 45 |
+
Ec?→Em nat tonahe.
|
| 46 |
+
Tore?→Em nat tonahe.
|
| 47 |
+
Eho tore?→Em nat tonahe.
|
| 48 |
+
Tei aek.→Em mna notorama.
|
| 49 |
+
Tamos nat atemo?→Em.
|
| 50 |
+
Tei aer aek!→Em mna notorama.
|
| 51 |
+
Ant?→Teies eho atea?
|
| 52 |
+
Em tane atea nat→Eho glo em!
|
| 53 |
+
Teies ala aek lehyn trone?→Nymer N!
|
| 54 |
+
Teies ala aek lehaer trone?→Nymer N!
|
| 55 |
+
Ant, teies ala aek lehaer trone?→Nymer N!
|
| 56 |
+
Ant. Teies ala aek lehyn trone?→Nymer N!
|
| 57 |
+
Teies ala aek _ tamos?→Em nat tonahe.
|
| 58 |
+
Ant. Teies ala aek _ tamos?→Em nat tonahe.
|
| 59 |
+
Ant. Teies ala aek Nier automata tamos?→Em nat tonahe.
|
| 60 |
+
Ant. Teies ala aek Breaking bad tamos?→Em nat tonahe.
|
| 61 |
+
Ant. Teies ala aek Mentalist tamos?→Em nat tonahe.
|
| 62 |
+
J ala aek.→Atemo
|
| 63 |
+
I tonahe tas.→Elc eho tas!
|
| 64 |
+
Elc tane tas!→At!
|
| 65 |
+
Tei xais.→Atemo
|
| 66 |
+
5+5=?→10
|
| 67 |
+
2+2=?→4
|
| 68 |
+
5*5=?→25
|
| 69 |
+
Tei nat mas eho!→Em nat mas. Eho .set_chance (1-100)
|
| 70 |
+
👍️→Aek.
|
| 71 |
+
🏹→Aek.
|
| 72 |
+
😢→Xais.
|
| 73 |
+
😭→Xais.
|
| 74 |
+
😿→Xais.
|
| 75 |
+
😁→Aek!
|
| 76 |
+
😀→Aek!
|
| 77 |
+
😃→Aek!
|
| 78 |
+
😄→Aek!
|
| 79 |
+
🤣→Xaho!
|
| 80 |
+
😆→Xaho!
|
| 81 |
+
😂→Xaho!
|
| 82 |
+
Xaho→Xaho!
|
| 83 |
+
Tei?→Em knato at.
|
| 84 |
+
Tei tonahe mna?→Em mna notorama.
|
| 85 |
+
A→C
|
| 86 |
+
C→E
|
| 87 |
+
E→I
|
| 88 |
+
I→K
|
| 89 |
+
0→1
|
| 90 |
+
1→2
|
| 91 |
+
2→3
|
| 92 |
+
3→4
|
| 93 |
+
4→5
|
| 94 |
+
5→6
|
| 95 |
+
6→7
|
| 96 |
+
7→8
|
| 97 |
+
8→9
|
| 98 |
+
Tas es aek tai.→Aek!
|
| 99 |
+
Tos!→Tos.
|
| 100 |
+
Snepi, Tos!→Tos.
|
| 101 |
+
Snepi. Tos!→Tos.
|
| 102 |
+
Tos snepi!→Tos.
|
| 103 |
+
Em gouan.→Tos.
|
| 104 |
+
La tho sa ehk ra es mna...→ALA!!! ALA!!!
|
| 105 |
+
Tei tonahe nat→Xais.
|
| 106 |
+
Tei tonahe nat→Xais.
|
| 107 |
+
Eho aer lehaer trone?→At!
|
| 108 |
+
Snepi. Eho lehyn trone?→At!
|
| 109 |
+
Teies tehst?→Emes tehst es tho.
|
test_kel.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
from keras.saving import load_model
|
| 3 |
+
from keras.preprocessing.text import Tokenizer
|
| 4 |
+
from keras_self_attention import SeqSelfAttention
|
| 5 |
+
from model_settings_kel import *
|
| 6 |
+
import json
|
| 7 |
+
from tokenizer import *
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
# Load the prompt -> response-index mapping and the list of response lines,
# then build the tokenizer vocabulary and restore the trained model.
with open(dataset_file, "r") as f:
    dset = json.load(f)

with open(responses_file, "r") as f:
    lines = [line.rstrip("\n") for line in f]

# Vocabulary must be fitted on the same prompts used at training time,
# otherwise token ids would not match the embedding layer.
fit_on_texts(list(dset.keys()))

model = load_model("chatbot_kel.keras", custom_objects={"SeqSelfAttention": SeqSelfAttention})
|
| 19 |
+
|
| 20 |
+
def find_line_number(array):
    """Return the index of the largest value in *array* (the argmax).

    Used to map the softmax output vector to the number of the response
    line whose neuron fired strongest.  On ties the earliest index wins,
    matching the original stable-sort behavior.
    """
    # np.argmax is a single O(n) pass; the previous implementation sorted
    # the whole vector (O(n log n)) and built three throwaway lists.
    return int(np.argmax(array))
|
| 22 |
+
|
| 23 |
+
def generate(text, verbose=1):
    """Tokenize *text*, run the model and return the chosen response line."""
    token_ids = list(tokenize(text))
    # Pad with zeros and truncate so the input is exactly inp_len ids long.
    padded = (token_ids + [0] * inp_len)[:inp_len]
    # Single-sample batch; [0] takes the one prediction vector back out.
    scores = model.predict(np.array([padded]), verbose=verbose)[0]
    return lines[find_line_number(scores)]
|
| 29 |
+
|
| 30 |
+
if __name__ == "__main__":  # run an interactive chat only when executed as a script
    while True:
        user_message = input("User: ")
        reply = generate(user_message)
        # "<null>" is the dataset's marker for "stay silent".
        if reply != "<null>":
            print(f"Bot: {reply}")
|
train_kel.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import json
|
| 3 |
+
from keras.optimizers import Adam, SGD
|
| 4 |
+
from keras.models import Sequential
|
| 5 |
+
from keras.layers import Embedding, Dense, Dropout, Flatten, PReLU, GaussianNoise
|
| 6 |
+
from tokenizer import *
|
| 7 |
+
from keras_self_attention import SeqSelfAttention, SeqWeightedAttention
|
| 8 |
+
from model_settings_kel import *
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
# Load the prompt -> response-index mapping; the responses file is read
# here only to count how many output classes the model needs.
with open(dataset_file, "r") as f:
    dset = json.load(f)

with open(responses_file, "r") as f:  # TODO: add support to a json-only dataset
    dset_size = sum(1 for _ in f)

# Fit the tokenizer vocabulary on every training prompt.
fit_on_texts(list(dset.keys()))

vocab_size = len(ind2text) + 1  # +1 reserves token id 0 for padding
|
| 20 |
+
|
| 21 |
+
# Classifier: token ids -> embedding -> self-attention -> dense stack -> softmax over response lines.
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=emb_size, input_length=inp_len))
model.add(SeqSelfAttention())  # attention lets the model weight the informative tokens instead of memorizing the raw text
model.add(Flatten())  # embedding/attention output is 2D (words x features); flatten it for the dense layers below
model.add(Dense(512, activation="linear"))  # linear pre-activation; the nonlinearity comes from the PReLU right after
model.add(PReLU())  # like ReLU but with a learned slope for negative inputs
model.add(Dropout(0.5))  # zero 50% of activations at train time so the model can't lean on accidental correlations
model.add(GaussianNoise(0.1))  # small additive noise as extra regularization
model.add(Dense(256, activation="relu"))
model.add(Dense(128, activation="relu"))
model.add(Dense(dset_size, activation="softmax"))  # one neuron per response line; softmax yields a probability distribution

model.summary()
|
| 34 |
+
|
| 35 |
+
X = []  # model inputs: fixed-length token-id sequences
y = []  # model targets: one-hot vectors over the response lines

for prompt in dset:
    ids = list(tokenize(prompt))
    # Zero-pad then truncate to inp_len so every sample fits the Embedding layer.
    X.append(np.array((ids + [0] * inp_len)[:inp_len]))
    one_hot = np.zeros(dset_size)
    one_hot[dset[prompt]] = 1  # activate the neuron of the correct response line
    y.append(one_hot)

# Keras trains on numpy arrays, not plain Python lists.
X = np.array(X)
y = np.array(y)
|
| 47 |
+
|
| 48 |
+
# categorical_crossentropy matches the one-hot targets and softmax output above.
model.compile(optimizer=Adam(learning_rate=0.001), loss="categorical_crossentropy", metrics=["accuracy",])

# epochs = full passes over the data; batch_size = samples per gradient step.
# Loss measures how far predictions are from the targets (lower is better);
# accuracy is the fraction of prompts answered with the correct line.
model.fit(X, y, epochs=128, batch_size=10, workers=4, use_multiprocessing=True)
# NOTE(review): workers/use_multiprocessing were removed in Keras 3 and, in
# Keras 2, only affect generator/Sequence inputs — confirm the installed
# Keras version accepts them for plain-array training.

model.summary()  # prints layer shapes and the trainable-parameter count

model.save("chatbot_kel.keras")
|