SLM
This commit is contained in:
188
llm.py
Normal file
188
llm.py
Normal file
@@ -0,0 +1,188 @@
|
||||
import os
|
||||
import random
|
||||
import sys
|
||||
import re
|
||||
import hashlib
|
||||
import pickle
|
||||
from collections import defaultdict, Counter
|
||||
|
||||
# Directory scanned for .txt training corpora.
SOURCES_DIR = "sources"
# Directory holding the pickled model caches plus the sources checksum.
CACHE_DIR = "models"
N_GRAM = 3  # Default N-gram for standalone script use
|
||||
|
||||
def get_dir_checksum(directory):
    """
    Return an MD5 hex digest covering every .txt file in *directory*.

    File names and file contents are folded into the digest in sorted-name
    order, so a rename, addition, removal, or content change all produce a
    different result.  Returns None when the directory does not exist.
    """
    if not os.path.exists(directory):
        return None

    digest = hashlib.md5()
    txt_names = sorted(name for name in os.listdir(directory) if name.endswith('.txt'))

    for name in txt_names:
        # Fold the name in first so renames are detected even when
        # contents are identical.
        digest.update(name.encode('utf-8'))
        with open(os.path.join(directory, name), "rb") as fh:
            while True:
                chunk = fh.read(4096)
                if not chunk:
                    break
                digest.update(chunk)

    return digest.hexdigest()
|
||||
|
||||
def train_model(sources_dir, n):
    """
    Train an n-gram model from every .txt file in *sources_dir*.

    Each file is read as UTF-8, wiki-style ``[[...]]`` markup is stripped,
    and the text is split on whitespace.  For every run of *n* consecutive
    words, the first ``n - 1`` words form the context and the final word is
    counted as a continuation of that context.

    Returns:
        defaultdict(Counter) mapping (n-1)-word context tuples to Counters
        of next-word frequencies.  Empty when no usable sources are found.
    """
    print(f"Training new {n}-gram model from sources...")
    model = defaultdict(Counter)

    files = [f for f in os.listdir(sources_dir) if f.endswith(".txt")]
    if not files:
        print("No source files found!")
        return model

    context_size = n - 1
    for filename in files:
        filepath = os.path.join(sources_dir, filename)
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                text = f.read()
            # BUG FIX: the brackets must be escaped.  The previous pattern
            # r'[[.*?]]' was parsed as the character class [[.*?] followed
            # by a literal ']', so it never removed [[...]] markup.
            text = re.sub(r'\[\[.*?\]\]', '', text)
            words = text.split()

            # Too short to yield even one full n-gram.
            if len(words) < n:
                continue

            for i in range(len(words) - context_size):
                context = tuple(words[i : i + context_size])
                next_word = words[i + context_size]
                model[context][next_word] += 1

        except Exception as e:
            # BUG FIX: report the offending file instead of "(unknown)".
            print(f"Error processing {filename}: {e}")

    return model
|
||||
|
||||
def load_or_train_model(sources_dir, n):
    """
    Return the n-gram model for *sources_dir*, using the on-disk cache.

    One pickled model per N lives in CACHE_DIR alongside a single shared
    checksum of the sources.  When the checksum still matches, the cached
    model for this N is loaded; otherwise every cached model is discarded
    (they were all built from the now-stale sources), and this N is
    retrained, saved, and the checksum refreshed.
    """
    # exist_ok avoids the check-then-create race of the previous version.
    os.makedirs(CACHE_DIR, exist_ok=True)

    cache_file = os.path.join(CACHE_DIR, f"model_n{n}.pkl")
    checksum_file = os.path.join(CACHE_DIR, "checksum.txt")  # one checksum for all N

    current_checksum = get_dir_checksum(sources_dir)

    # Check if a model for this N exists and if the checksum matches.
    if os.path.exists(cache_file) and os.path.exists(checksum_file):
        with open(checksum_file, 'r') as f:
            saved_checksum = f.read()

        if saved_checksum == current_checksum:
            print(f"Sources unchanged. Loading model N={n} from {cache_file}...")
            with open(cache_file, 'rb') as f:
                # NOTE: pickle is only safe because we wrote this file
                # ourselves; never load pickles from untrusted sources.
                return pickle.load(f)
        else:
            print("Sources changed. Global retrain needed. Deleting old models.")
            for item in os.listdir(CACHE_DIR):
                os.remove(os.path.join(CACHE_DIR, item))

    print(f"No valid cache found for N={n}. Training...")
    model = train_model(sources_dir, n)

    print(f"Saving model to {cache_file}...")
    with open(cache_file, 'wb') as f:
        pickle.dump(model, f)
    # Refresh the shared checksum only after a successful train + save.
    with open(checksum_file, 'w') as f:
        f.write(current_checksum or "")

    return model
|
||||
|
||||
def generate_text(model, start_prompt, length=100, temperature=1.0):
    """
    Generate *length* words from an n-gram *model*.

    The starting context is chosen in order of preference:
      1. the last n-1 words of *start_prompt*, when that context is known;
      2. a random known context beginning with the prompt's last word;
      3. a uniformly random context from the model.

    *temperature* reshapes the next-word distribution via weights
    ``count ** (1 / temperature)``: values < 1 sharpen it, values > 1
    flatten it.  Returns the generated words joined by single spaces
    ("" for an empty model); *start_prompt* itself is not included.
    """
    if not model:
        return ""

    try:
        # Infer the context size (n-1) from any key of the model.
        context_size = len(next(iter(model)))
    except StopIteration:
        return ""  # Model is empty

    start_words = start_prompt.split()
    current_context = None

    # Preference 1: exact match on the prompt's trailing context.
    if len(start_words) >= context_size:
        potential_context = tuple(start_words[-context_size:])
        if potential_context in model:
            current_context = potential_context

    # Preference 2: any context that starts with the prompt's last word.
    if current_context is None and start_words:
        last_word = start_words[-1]
        candidates = [k for k in model.keys() if k[0] == last_word]
        if candidates:
            current_context = random.choice(candidates)

    # Preference 3: fall back to a uniformly random context.
    if current_context is None:
        current_context = random.choice(list(model.keys()))

    generated_words = []

    for _ in range(length):
        # Dead end: re-seed from a random context.  The membership test
        # also avoids creating empty entries when model is a defaultdict.
        if current_context not in model or not model[current_context]:
            current_context = random.choice(list(model.keys()))

        possible_next = list(model[current_context].keys())
        counts = list(model[current_context].values())

        try:
            if temperature == 1.0:
                weights = counts
            else:
                weights = [c ** (1.0 / temperature) for c in counts]
            next_word = random.choices(possible_next, weights=weights, k=1)[0]
        except (ValueError, IndexError, ZeroDivisionError, OverflowError):
            # Fallback when the weights are invalid — including the
            # previously-uncaught temperature == 0 (ZeroDivisionError) and
            # extreme exponents (OverflowError) — or no word is possible.
            current_context = random.choice(list(model.keys()))
            next_word = current_context[0]

        generated_words.append(next_word)
        # Slide the context window forward by one word.
        current_context = current_context[1:] + (next_word,)

    return " ".join(generated_words)
|
||||
|
||||
def main():
    """CLI entry point: llm.py [start_prompt] [length] [temperature]."""
    # Refuse to run without a corpus directory.
    if not os.path.isdir(SOURCES_DIR):
        print(f"Error: Directory '{SOURCES_DIR}' not found.")
        sys.exit(1)

    model = load_or_train_model(SOURCES_DIR, N_GRAM)
    print(f"Model ready. (N={N_GRAM}, Keys={len(model)})")

    # Generation defaults; positional CLI arguments override them.
    length = 100
    temperature = 1.0

    args = sys.argv[1:]
    if args:
        start_prompt = args[0]
        if len(args) >= 2:
            length = int(args[1])
        if len(args) >= 3:
            temperature = float(args[2])
    else:
        # No prompt given: seed from a random known context.
        start_prompt = " ".join(random.choice(list(model.keys())))

    print(f"\n--- Generating (Start: '{start_prompt}', Temp: {temperature}) ---\n")
    output = start_prompt + " " + generate_text(model, start_prompt, length, temperature)
    print(output)
    print("\n-------------------------------------------------------------")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user