RAG – Retrieval Augmented Generation¶

CIML Summer 2025¶

Setup: LangChain¶

In this RAG tutorial, we'll be working with LangChain, which is a powerful framework for building applications with language models. LangChain provides utilities for working with various language model providers, integrating embeddings, and creating chains for more complex applications. Below are the necessary imports for this notebook:

In [1]:
import os
import random
import glob
import numpy as np
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.memory import ConversationSummaryMemory
from langchain.prompts import PromptTemplate
from langchain_community.llms import Ollama
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import TextLoader
from chromadb.utils import embedding_functions
from langchain_community.embeddings import HuggingFaceEmbeddings

import warnings
warnings.filterwarnings('ignore')

Part 1: Retrieval¶

  • In this section, we'll focus on the retrieval aspect of RAG. We'll start by understanding vectorization, followed by storing and retrieving vectors efficiently.

Vectorizing¶

  • Vectorization is the process of converting text into vectors in an embedding space. These vectors capture the semantic meaning of the text, enabling us to perform operations like similarity calculations. We'll use HuggingFaceEmbeddings for this task; see the LangChain documentation for this object for more details.
In [2]:
vectorizer = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

This vectorizer converts text into vectors in embedding space. Let's see how we can use it.

In [3]:
vectorizer.embed_query("dog")[0:10]
Out[3]:
[-0.05314699560403824,
 0.014194400049746037,
 0.007145748008042574,
 0.06860868632793427,
 -0.07848034799098969,
 0.01016747672110796,
 0.10228313505649567,
 -0.01206485740840435,
 0.09521342068910599,
 -0.030350159853696823]
  • As you can see from above, this converts text into a series of numbers.
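A quick sanity check, not part of the original exercise: the embedding returned by all-MiniLM-L6-v2 should be a 384-dimensional vector.

In [ ]:
# Check the embedding dimensionality (all-MiniLM-L6-v2 produces 384-dimensional vectors)
vec = vectorizer.embed_query("dog")
print(len(vec))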

Task 1¶

Your job is to write a function that takes in two strings, vectorizes them, and returns their cosine similarity. Implement the following function.

similarity_two_queries¶

In [4]:
def similarity_two_queries(word1, word2):
    # HINT:
    # Use vectorizer.embed_query(<text>) to embed text.
    # Use np.dot to find the cosine similarity/dot product of 2 vectors
    # TODO

    return None
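If you get stuck, here is one possible implementation (a sketch): it computes the full cosine similarity by dividing the dot product by the vector norms, so it works whether or not the embeddings are already unit-length.

In [ ]:
# Possible solution (sketch): embed both strings and compute cosine similarity
def similarity_two_queries(word1, word2):
    v1 = np.array(vectorizer.embed_query(word1))
    v2 = np.array(vectorizer.embed_query(word2))
    # Cosine similarity = dot product divided by the product of the vector norms
    return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))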
  • Observe the similarity scores of both 'cat' and 'dog' to the word 'kitten'
In [ ]:
print("Similarity of 'kitten' and 'cat': ",similarity_two_queries("kitten","cat"))
print("Similarity of 'kitten' and 'dog': ",similarity_two_queries("kitten","dog"))
  • By using the previously defined function, we can take pairs of texts and quantify how similar they are.

Task 2¶

Which of the words in the list words are most related to the word 'color'? The function similarity_list takes a word and a list of words, and outputs each word with its similarity score, ordered from highest to lowest.

In [ ]:
def similarity_list(word, words):
    # Compare `word` against every candidate and sort by similarity, highest first
    similarities = [(w, similarity_two_queries(word, w)) for w in words]
    return sorted(similarities, key=lambda x: x[1], reverse=True)
In [ ]:
words = ["rainbow","car","black","red","cat","tree"]
In [ ]:
# TODO: Which words are most similar to color?
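One way to answer this (assuming similarity_two_queries from Task 1 is implemented):

In [ ]:
# Rank the candidate words by their similarity to 'color'
similarity_list("color", words)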

Task 3¶

Each query below has an appropriate text that allows you to answer the question. The function match_queries_with_texts matches a query with its most related text. Come up with 3 more questions and 3 suitable answers and add them to the list below.

In [8]:
def match_queries_with_texts(queries, texts):
    # Calculate similarities between each query and text
    similarities = np.zeros((len(queries), len(texts)))
    
    for i, query in enumerate(queries):
        for j, text in enumerate(texts):
            similarities[i, j] = similarity_two_queries(query, text)
    
    # Match each query to the text with the highest similarity
    matches = {}
    for i, query in enumerate(queries):
        best_match_idx = np.argmax(similarities[i])
        matches[query] = texts[best_match_idx]
    
    return matches
In [9]:
# TODO: Fill in the list to make suitable question-text pairs.

queries = ["What are the 7 colors of the rainbow?", 
           "What does Elsie do for work?", 
           "Which country has the largest population?",
           "-- INSERT QUERY 1 HERE--",
           "-- INSERT QUERY 2 HERE--",
           "-- INSERT QUERY 3 HERE--"]
texts = ["China has 1.4 billion people.",
         "Elsie works the register at Arby's.", 
         "The colors of the rainbow are ROYGBIV.",
         "-- INSERT TEXT 1 HERE--",
         "-- INSERT TEXT 2 HERE--",
         "-- INSERT TEXT 3 HERE--"]
  • Now we shuffle the queries and texts. Let's see if we can match them!
In [10]:
import random
random.shuffle(queries)
random.shuffle(texts)

match_queries_with_texts(queries, texts)
Out[10]:
{'What are the 7 colors of the rainbow?': 'China has 1.4 billion people.',
 'What does Elsie do for work?': 'China has 1.4 billion people.',
 'Which country has the largest population?': 'China has 1.4 billion people.',
 '-- INSERT QUERY 2 HERE--': 'China has 1.4 billion people.',
 '-- INSERT QUERY 3 HERE--': 'China has 1.4 billion people.',
 '-- INSERT QUERY 1 HERE--': 'China has 1.4 billion people.'}

Vector Store¶

  • Now let's look at how we can store these vectors for efficient retrieval. There are many storage options, but in this exercise we use ChromaDB, an open-source vector database.
  • Through LangChain, we can expose the database as a *retriever* object, which essentially allows us to perform queries similarly to what we have done before.
  • Taking the texts and queries that you defined before, we can load them into ChromaDB and perform the same operations.
In [11]:
ids = list(range(len(texts)))
random_id = random.randint(100000, 999999)
db = Chroma.from_texts(texts, vectorizer, metadatas=[{"id": id} for id in ids],collection_name=f"temp_{random_id}")
retriever = db.as_retriever(search_kwargs={"k": 1})
In [12]:
texts
Out[12]:
['China has 1.4 billion people.',
 'The colors of the rainbow are ROYGBIV.',
 '-- INSERT TEXT 1 HERE--',
 "Elsie works the register at Arby's.",
 '-- INSERT TEXT 3 HERE--',
 '-- INSERT TEXT 2 HERE--']
In [ ]:
retriever.invoke("Which country has the largest population?")
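The retriever returns a list of LangChain Document objects; a small sketch of how you might pull out just the text and metadata:

In [ ]:
# Inspect the retrieved Document objects: page_content holds the text, metadata holds the id
docs = retriever.invoke("Which country has the largest population?")
for doc in docs:
    print(doc.metadata, doc.page_content)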
  • workplaces.txt contains names and workplaces of several people. Now let’s apply the same retrieval process to a file we read in.
In [ ]:
with open("workplaces.txt", 'r') as file:
    lines = file.readlines()
lines = [line.strip() for line in lines]
print(lines[0:4])

workplace_retriever is a function that reads the workplaces.txt file and returns the database as a retriever that you can use to find out the workplaces of people in the file. You can specify the number of top-k results via the function's argument.

In [ ]:
def workplace_retriever(k=3):
    with open("workplaces.txt", 'r') as file:
        lines = file.readlines()
    lines = [line.strip() for line in lines]
    
    db = Chroma.from_texts(
        lines,
        vectorizer,
        metadatas=[{"id": id} for id in range(len(lines))],
        collection_name=f"temp_{id(lines)}"
    )
    
    retriever = db.as_retriever(search_kwargs={"k": k})
    return retriever

Task 4¶

Using workplace_retriever, find out who works at Starbucks and McDonald's.

In [ ]:
# TODO: Find out who works at Starbucks and McDonald's. Use workplace_retriever(k=3).invoke(<query>) to do this.
# Remember to experiment with the value of k to make sure you find all the people that work in one place.
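A possible approach (a sketch; the query wording and the value of k are just examples to experiment with):

In [ ]:
# Retrieve the top-k lines most similar to each workplace query
retriever = workplace_retriever(k=3)
for query in ["Who works at Starbucks?", "Who works at McDonald's?"]:
    print(query)
    for doc in retriever.invoke(query):
        print("  ", doc.page_content)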

Chunking¶

The workplaces.txt data we just looked at was conveniently split into lines, with each line representing a distinct and meaningful chunk of information. This straightforward structure makes it easier to process and analyze the text data.

However, it is usually not so straightforward:

  • When dealing with text data, especially from large or complex documents, it's essential to handle the formatting and structure efficiently.
  • If a file is not so neatly formatted, we can break it down into manageable chunks using LangChain's TextLoader and RecursiveCharacterTextSplitter.
  • This allows us to preprocess and chunk the data effectively for further use in our RAG pipeline.

Let's take a look at some of the Expanse documentation. We have downloaded the contents of the documentation webpage into two text files named expanse_doc_1.txt and expanse_doc_2.txt.

In [ ]:
with open("expanse_doc_1.txt", 'r') as file:
    lines = file.readlines()
lines = [line.strip() for line in lines]
print(lines[20:35])
  • We see that the data is not split into meaningful chunks of information by default, so we need to do our best to format it in a way that is useful. This is why we use chunks, which group local and neighboring text together.
  • When using the RecursiveCharacterTextSplitter, the chunk size determines the maximum size of each text chunk. This is particularly useful when dealing with large documents that need to be split into smaller, manageable pieces for better retrieval and analysis.
In [ ]:
def expanse_retriever(chunk_size):
    loader = TextLoader('expanse_doc_1.txt')
    documents = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=10, separators=[" ", ",", "\n"])
    texts = text_splitter.split_documents(documents)
    db = Chroma(embedding_function=vectorizer, collection_name=f"expanse_temp_{id(texts)}")
    db.add_documents(texts)
    retriever = db.as_retriever(search_kwargs={"k": 3})
    return retriever

Task 5¶

A function that chunks expanse_doc_1.txt has been provided above. Experiment with different chunk sizes and pick a size that captures enough information to answer the question: *"How do you run jobs on expanse?"* Try sizes 10, 100, and 1000 and observe what information is returned.

In [ ]:
# TODO: Think about how many characters would be needed to contain useful information for such a complex task
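A minimal sketch for comparing chunk sizes (it simply prints the start of each retrieved chunk so you can judge how much useful context each size captures):

In [ ]:
# Compare what the retriever returns for different chunk sizes
question = "How do you run jobs on expanse?"
for size in [10, 100, 1000]:
    print(f"--- chunk_size = {size} ---")
    for doc in expanse_retriever(size).invoke(question):
        print(doc.page_content[:200], "\n")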

Multiple Document Chunking¶

When we have more than one document we want to use in our database, we can simply chunk them iteratively. Metadata for the text source is added by default, but we can also add our own metadata, such as chunk IDs.

Task 6¶

expanse_all_retriever, a function that chunks both expanse_doc_1.txt and expanse_doc_2.txt, has been provided below. Using a chunk size of 1000 characters, find which document the information for *"Compiling Codes"* is most likely located in. Hint: look at the metadata.

In [ ]:
def expanse_all_retriever(chunk_size):
    random_id = random.randint(100000, 999999)  # random 6-digit ID for uniqueness

    db = Chroma(
        embedding_function=vectorizer,
        collection_name=f"expanse_all_temp_{random_id}"
    )

    pattern = 'expanse_doc_*.txt'
    file_list = glob.glob(pattern)

    for file_name in file_list:
        loader = TextLoader(file_name)
        documents = loader.load()

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=10,
            separators=[" ", ",", "\n"]
        )
        texts = text_splitter.split_documents(documents)

        for i, text in enumerate(texts):
            text.metadata["chunk_number"] = i

        db.add_documents(texts)

    retriever = db.as_retriever(search_kwargs={"k": 3})
    return retriever
In [ ]:
# TODO: Find the relevant source for the query "Compiling Codes"
In [ ]:
chunks = expanse_all_retriever(1000).invoke("Compiling Codes")
for chunk in chunks:
    print(chunk.metadata)

Part 2: Basic RAG¶

Ollama is an open-source LLM platform that allows us to use a plethora of different LLMs.

We first need to launch the Ollama instance. In a terminal window in the JupyterLab instance, run:

export OLLAMA_HOST="0.0.0.0:$(shuf -i 3000-65000 -n 1)"; echo ${OLLAMA_HOST##*:} > ~/.ollama_port

ollama serve

In [ ]:
import os
from ollama import Client

# Read the port from the file
with open(os.path.expanduser('~/.ollama_port')) as f:
    port = f.read().strip()

# Connect to 127.0.0.1:<port>
host = f"http://127.0.0.1:{port}"

client = Client(host=host)
In [ ]:
# Get LLM
client.pull("gemma3:4b")
In [ ]:
llm = Ollama(
    model="gemma3:4b",
    base_url=f"http://127.0.0.1:{port}",  # CRITICAL: Use your custom port
    temperature=0
)
In [ ]:
llm.invoke("How are you?")

Task 7¶

Write a function that takes your question, retrieves relevant context using workplace_retriever, and then sends that context to Ollama so it can answer your question in natural language. Fill in workplace_question below to accomplish this.

In [ ]:
# TODO
def workplace_question(question):
    retriever = #TODO: assign the retriever
    context = #TODO: invoke the retriever here
    llm = Ollama(model="gemma3:4b",base_url=f"http://127.0.0.1:{port}",temperature="0.2")
    prompt = f"Based on the following context: {context}, answer the question: "
    response = #TODO: invoke ollama with the prompt and question
    return response
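One possible way to fill in workplace_question (a sketch; the exact prompt wording and value of k are up to you):

In [ ]:
# Possible solution (sketch): retrieve context, then ask the LLM to answer using that context
def workplace_question(question):
    retriever = workplace_retriever(k=3)
    context = retriever.invoke(question)  # list of Document objects used as context
    llm = Ollama(model="gemma3:4b", base_url=f"http://127.0.0.1:{port}", temperature=0.2)
    prompt = f"Based on the following context: {context}, answer the question: "
    response = llm.invoke(prompt + question)
    return response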
In [ ]:
print(workplace_question("Who are the people that work at Starbucks?"))

Part 3: LangChain RAG¶

The above is a very simple example of RAG. Now, using LangChain, we can put everything together in a cleaner, all-inclusive way in one go. Let's combine everything we've learned into the function generate_rag.

  • The implementation below includes a custom class that lets us view which chunks are being used for our queries.
In [ ]:
def generate_rag(verbose=False, chunk_info=False):
    import glob
    random_id = random.randint(100000, 999999)
    vectorizer = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    db = Chroma(embedding_function=vectorizer, collection_name=f"expanse_all_temp_{random_id}")
    pattern = 'expanse_doc_*.txt'
    file_list = glob.glob(pattern)
    for file_name in file_list:
        loader = TextLoader(file_name)
        documents = loader.load()
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=10, separators=[" ", ",", "\n"])
        texts = text_splitter.split_documents(documents)
        for i, text in enumerate(texts):
            text.metadata["chunk_number"] = i
        db.add_documents(texts)
    
    template = """<s>[INST] Given the context - {context} </s>[INST] [INST] Answer the following question - {question}[/INST]"""
    pt = PromptTemplate(
                template=template, input_variables=["context", "question"]
            )
    # Let's retrieve the top 3 chunks for our results
    retriever = db.as_retriever(search_kwargs={"k": 3})
    class CustomRetrievalQA(RetrievalQA):
        def invoke(self, *args, **kwargs):
            result = super().invoke(*args, **kwargs)
            if chunk_info:
                # Print out the chunks that were retrieved
                print("Chunks being looked at:")
                chunks = retriever.invoke(*args, **kwargs)
                for chunk in chunks:
                    print(f"Source: {chunk.metadata['source']}, Chunk number: {chunk.metadata['chunk_number']}")
                    print(f"Text snippet: {chunk.page_content[:200]}...\n")  # Print the first 200 characters
            return result
    rag = CustomRetrievalQA.from_chain_type(
        llm=Ollama(model="gemma3:4b", base_url=f"http://127.0.0.1:{port}", temperature=0),
        retriever=retriever,
        memory=ConversationSummaryMemory(llm=Ollama(model="gemma3:4b", base_url=f"http://127.0.0.1:{port}")),
        chain_type_kwargs={"prompt": pt, "verbose": verbose},
    )

    return rag

Task 8¶

Compare how Gemma performs without context and with context, i.e., without RAG and with RAG.

In [ ]:
print(llm.invoke("How can a user check their resource allocations and the resources they can use on the Expanse supercomputer"))
#Try "How can a user check their resource allocations and the resources they can use on the Expanse supercomputer"
In [ ]:
expanse_rag = generate_rag()
result = expanse_rag.invoke("How can a user check their resource allocations and the resources they can use on the Expanse supercomputer")
print(result["result"])
  • When we set verbose to True, we can see exactly what is being passed into the LLM, highlighted in green.
In [ ]:
expanse_rag = generate_rag(verbose=True)
result = expanse_rag.invoke("How can a user check their resource allocations and the resources they can use on the Expanse supercomputer")
print(result["result"])
  • For more concise information, setting chunk_info=True lets us see individual chunk details as well as their sources.
In [ ]:
expanse_rag = generate_rag(chunk_info=True)
result = expanse_rag.invoke("How can a user check their resource allocations and the resources they can use on the Expanse supercomputer")
In [ ]:
print(result["result"])

Great work! We've officially made a chatbot that can help us out with all things Expanse, at least according to the 2 .txt files we have access to!