Upload 3 files
Browse files
- dset_kel.txt +109 -0
- test_kel.py +34 -0
- train_kel.py +55 -0
dset_kel.txt
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Xai.→Xai.
|
| 2 |
+
hai.→Xai.
|
| 3 |
+
... hai→Xai.
|
| 4 |
+
Ant, hai.→Xai.
|
| 5 |
+
Ant. Xai.→Xai.
|
| 6 |
+
Ant. hai.→Xai.
|
| 7 |
+
hai snepi→Xai.
|
| 8 |
+
snepi, Xai→Xai.
|
| 9 |
+
snepi, hai→Xai.
|
| 10 |
+
Xai snepi→Xai.
|
| 11 |
+
Tei trate?→At!
|
| 12 |
+
Tei noh?→Em trate!
|
| 13 |
+
tei not tonahe teies langa. haho.→👍️
|
| 14 |
+
Ameos noh.→Em trate!
|
| 15 |
+
Teies tai?→Emes tai aek! Teies?
|
| 16 |
+
Em eho. Tos.→Tos.
|
| 17 |
+
Eho tos...→Tos.
|
| 18 |
+
Eho hai...→Xai.
|
| 19 |
+
Eho nat...→Nat.
|
| 20 |
+
Eho at...→At!
|
| 21 |
+
Xai. Teies tai aek?→Emes tai aek! Teies?
|
| 22 |
+
Em nohame krate emes sait. Aek atea?→At!
|
| 23 |
+
Xame→Emta!
|
| 24 |
+
Xame, aek→Emta!
|
| 25 |
+
At→Aek.
|
| 26 |
+
Nat→Atemo
|
| 27 |
+
Tore nat!→Atemo
|
| 28 |
+
Tei tonahe teies tai aek?→At. Em notorama. Emes xuak tai nat.
|
| 29 |
+
Em knato teies tai xais.→Em nat tonahe. Em notorama.
|
| 30 |
+
Emta→Aek.
|
| 31 |
+
At. Em tane tas.→Aek
|
| 32 |
+
Tamos krate tei?→Ierhon krate em.
|
| 33 |
+
Tamos Ierhon?→Ierhon krate em.
|
| 34 |
+
Em tonahe.→Em eho aer toh nat mas.
|
| 35 |
+
Tei nat emta?→Em knato - at.
|
| 36 |
+
Tei knato?→Em tonahe mna toh.
|
| 37 |
+
Tei notorama?→At! Ierhon krate em glo em es haho.
|
| 38 |
+
Tei ChatGPT?→Nat, em mna notorama.
|
| 39 |
+
Tei eho xame toh.→Em nat tonahe mnor eho.
|
| 40 |
+
Tei eho hame ehat.→Em nat tonahe mnor eho.
|
| 41 |
+
Tei tonahe emes mnor eho?→Em nat tonahe mnor eho.
|
| 42 |
+
Em mnor eho?→Em nat tonahe mnor eho.
|
| 43 |
+
Eho emes mnor eho.→Em nat tonahe mnor eho.
|
| 44 |
+
Em tane atea nat.→Elc... Em knato... Leho aer lehyn trone.
|
| 45 |
+
Ec?→Em nat tonahe.
|
| 46 |
+
Tore?→Em nat tonahe.
|
| 47 |
+
Eho tore?→Em nat tonahe.
|
| 48 |
+
Tei aek.→Em mna notorama.
|
| 49 |
+
Tamos nat atemo?→Em.
|
| 50 |
+
Tei aer aek!→Em mna notorama.
|
| 51 |
+
Ant?→Teies eho atea?
|
| 52 |
+
Em tane atea nat→Eho glo em!
|
| 53 |
+
Teies ala aek lehyn trone?→Nymer N!
|
| 54 |
+
Teies ala aek lehaer trone?→Nymer N!
|
| 55 |
+
Ant, teies ala aek lehaer trone?→Nymer N!
|
| 56 |
+
Ant. Teies ala aek lehyn trone?→Nymer N!
|
| 57 |
+
Teies ala aek _ tamos?→Em nat tonahe.
|
| 58 |
+
Ant. Teies ala aek _ tamos?→Em nat tonahe.
|
| 59 |
+
Ant. Teies ala aek Nier automata tamos?→Em nat tonahe.
|
| 60 |
+
Ant. Teies ala aek Breaking bad tamos?→Em nat tonahe.
|
| 61 |
+
Ant. Teies ala aek Mentalist tamos?→Em nat tonahe.
|
| 62 |
+
J ala aek.→Atemo
|
| 63 |
+
I tonahe tas.→Elc eho tas!
|
| 64 |
+
Elc tane tas!→At!
|
| 65 |
+
Tei xais.→Atemo
|
| 66 |
+
5+5=?→10
|
| 67 |
+
2+2=?→4
|
| 68 |
+
5*5=?→25
|
| 69 |
+
Tei nat mas eho!→Em nat mas. Eho .set_chance (1-100)
|
| 70 |
+
👍️→Aek.
|
| 71 |
+
🏹→Aek.
|
| 72 |
+
😢→Xais.
|
| 73 |
+
😭→Xais.
|
| 74 |
+
😿→Xais.
|
| 75 |
+
😁→Aek!
|
| 76 |
+
😀→Aek!
|
| 77 |
+
😃→Aek!
|
| 78 |
+
😄→Aek!
|
| 79 |
+
🤣→Xaho!
|
| 80 |
+
😆→Xaho!
|
| 81 |
+
😂→Xaho!
|
| 82 |
+
Xaho→Xaho!
|
| 83 |
+
Tei?→Em knato at.
|
| 84 |
+
Tei tonahe mna?→Em mna notorama.
|
| 85 |
+
A→C
|
| 86 |
+
C→E
|
| 87 |
+
E→I
|
| 88 |
+
I→K
|
| 89 |
+
0→1
|
| 90 |
+
1→2
|
| 91 |
+
2→3
|
| 92 |
+
3→4
|
| 93 |
+
4→5
|
| 94 |
+
5→6
|
| 95 |
+
6→7
|
| 96 |
+
7→8
|
| 97 |
+
8→9
|
| 98 |
+
Tas es aek tai.→Aek!
|
| 99 |
+
Tos!→Tos.
|
| 100 |
+
Snepi, Tos!→Tos.
|
| 101 |
+
Snepi. Tos!→Tos.
|
| 102 |
+
Tos snepi!→Tos.
|
| 103 |
+
Em gouan.→Tos.
|
| 104 |
+
La tho sa ehk ra es mna...→ALA!!! ALA!!!
|
| 105 |
+
Tei tonahe nat→Xais.
|
| 106 |
+
Tei tonahe nat→Xais.
|
| 107 |
+
Eho aer lehaer trone?→At!
|
| 108 |
+
Snepi. Eho lehyn trone?→At!
|
| 109 |
+
Teies tehst?→Emes tehst es tho.
|
test_kel.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
from keras.saving import load_model
|
| 3 |
+
from keras.preprocessing.text import Tokenizer
|
| 4 |
+
from keras_self_attention import SeqSelfAttention
|
| 5 |
+
from model_settings_kel import *
|
| 6 |
+
import json
|
| 7 |
+
from tokenizer import *
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
# Load the prompt -> response-index mapping and the list of response lines,
# then build the tokenizer vocabulary and restore the trained model.
with open(dataset_file, "r") as f:
    dset = json.load(f)

with open(responses_file, "r") as f:
    lines = [line.rstrip("\n") for line in f]

# Vocabulary must be fitted on the same prompts used at training time,
# otherwise token ids would not match the embedding layer.
fit_on_texts(list(dset.keys()))

model = load_model("chatbot_kel.keras", custom_objects={"SeqSelfAttention": SeqSelfAttention})
|
| 19 |
+
|
| 20 |
+
def find_line_number(array):
    """Return the index of the largest value in *array* (the argmax).

    Used to map the softmax output vector to the number of the response
    line whose neuron fired strongest.  On ties the earliest index wins,
    matching the original stable-sort behavior.
    """
    # np.argmax is a single O(n) pass; the previous implementation sorted
    # the whole vector (O(n log n)) and built three throwaway lists.
    return int(np.argmax(array))
|
| 22 |
+
|
| 23 |
+
def generate(text, verbose=1):
    """Tokenize *text*, run the model and return the chosen response line."""
    token_ids = list(tokenize(text))
    # Pad with zeros and truncate so the input is exactly inp_len ids long.
    padded = (token_ids + [0] * inp_len)[:inp_len]
    # Single-sample batch; [0] takes the one prediction vector back out.
    scores = model.predict(np.array([padded]), verbose=verbose)[0]
    return lines[find_line_number(scores)]
|
| 29 |
+
|
| 30 |
+
if __name__ == "__main__":  # run an interactive chat only when executed as a script
    while True:
        user_message = input("User: ")
        reply = generate(user_message)
        # "<null>" is the dataset's marker for "stay silent".
        if reply != "<null>":
            print(f"Bot: {reply}")
|
train_kel.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import json
|
| 3 |
+
from keras.optimizers import Adam, SGD
|
| 4 |
+
from keras.models import Sequential
|
| 5 |
+
from keras.layers import Embedding, Dense, Dropout, Flatten, PReLU, GaussianNoise
|
| 6 |
+
from tokenizer import *
|
| 7 |
+
from keras_self_attention import SeqSelfAttention, SeqWeightedAttention
|
| 8 |
+
from model_settings_kel import *
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
# Load the prompt -> response-index mapping; the responses file is read
# here only to count how many output classes the model needs.
with open(dataset_file, "r") as f:
    dset = json.load(f)

with open(responses_file, "r") as f:  # TODO: add support to a json-only dataset
    dset_size = sum(1 for _ in f)

# Fit the tokenizer vocabulary on every training prompt.
fit_on_texts(list(dset.keys()))

vocab_size = len(ind2text) + 1  # +1 reserves token id 0 for padding
|
| 20 |
+
|
| 21 |
+
# Classifier: token ids -> embedding -> self-attention -> dense stack -> softmax over response lines.
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=emb_size, input_length=inp_len))
model.add(SeqSelfAttention())  # attention lets the model weight the informative tokens instead of memorizing the raw text
model.add(Flatten())  # embedding/attention output is 2D (words x features); flatten it for the dense layers below
model.add(Dense(512, activation="linear"))  # linear pre-activation; the nonlinearity comes from the PReLU right after
model.add(PReLU())  # like ReLU but with a learned slope for negative inputs
model.add(Dropout(0.5))  # zero 50% of activations at train time so the model can't lean on accidental correlations
model.add(GaussianNoise(0.1))  # small additive noise as extra regularization
model.add(Dense(256, activation="relu"))
model.add(Dense(128, activation="relu"))
model.add(Dense(dset_size, activation="softmax"))  # one neuron per response line; softmax yields a probability distribution

model.summary()
|
| 34 |
+
|
| 35 |
+
X = []  # model inputs: fixed-length token-id sequences
y = []  # model targets: one-hot vectors over the response lines

for prompt in dset:
    ids = list(tokenize(prompt))
    # Zero-pad then truncate to inp_len so every sample fits the Embedding layer.
    X.append(np.array((ids + [0] * inp_len)[:inp_len]))
    one_hot = np.zeros(dset_size)
    one_hot[dset[prompt]] = 1  # activate the neuron of the correct response line
    y.append(one_hot)

# Keras trains on numpy arrays, not plain Python lists.
X = np.array(X)
y = np.array(y)
|
| 47 |
+
|
| 48 |
+
# categorical_crossentropy matches the one-hot targets and softmax output above.
model.compile(optimizer=Adam(learning_rate=0.001), loss="categorical_crossentropy", metrics=["accuracy",])

# epochs = full passes over the data; batch_size = samples per gradient step.
# Loss measures how far predictions are from the targets (lower is better);
# accuracy is the fraction of prompts answered with the correct line.
model.fit(X, y, epochs=128, batch_size=10, workers=4, use_multiprocessing=True)
# NOTE(review): workers/use_multiprocessing were removed in Keras 3 and, in
# Keras 2, only affect generator/Sequence inputs — confirm the installed
# Keras version accepts them for plain-array training.

model.summary()  # prints layer shapes and the trainable-parameter count

model.save("chatbot_kel.keras")
|