File size: 1,415 Bytes
72f5b02
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import os
import PyPDF2
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Load the sentence-embedding model once, at import time. The resulting
# module-level `embedder` is the object create_vector_store() calls `.encode()` on.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

def load_pdf(file_path):
    """Extract the text of every page of a PDF file.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The text of each page that yielded any text, every page followed by
        a single space, concatenated in page order. An empty string when no
        page yields text.
    """
    with open(file_path, "rb") as f:
        reader = PyPDF2.PdfReader(f)
        # Extract each page exactly once; calling extract_text() both in the
        # condition and in the concatenation would repeat the extraction work.
        # This must happen while the file is still open.
        page_texts = [page.extract_text() for page in reader.pages]
    # Skip pages with no text (empty string or None) and join in one pass
    # instead of growing a string with repeated `+=`.
    return "".join(page_text + " " for page_text in page_texts if page_text)

def load_all_pdfs(folder="notes", chunk_size=500):
    """Load every PDF in a folder and split its text into fixed-size chunks.

    Args:
        folder: Directory to scan (non-recursively) for files whose name
            ends in ".pdf" (matched case-insensitively).
        chunk_size: Maximum number of characters per chunk. Must be positive.

    Returns:
        A tuple ``(all_chunks, sources)`` of two equal-length lists:
        ``all_chunks[i]`` is a piece of extracted text and ``sources[i]`` is
        the name of the PDF it came from, without the ".pdf" suffix.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    # Validate up front: a zero step makes range() fail with an unrelated
    # message, and a negative step would silently produce no chunks at all.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")

    suffix = ".pdf"
    all_chunks = []
    sources = []

    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # position of every chunk stable from one run to the next.
    for file in sorted(os.listdir(folder)):
        if not file.lower().endswith(suffix):
            continue
        # Strip only the trailing extension; str.replace() would also remove
        # any ".pdf" occurring in the middle of the file name.
        subject = file[:-len(suffix)]
        print(f"📖 Loading {subject} ...")
        text = load_pdf(os.path.join(folder, file))

        # Split into consecutive, non-overlapping chunks
        chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
        all_chunks.extend(chunks)
        sources.extend([subject] * len(chunks))  # Keep track of subject

    return all_chunks, sources

def create_vector_store(chunks):
    """Embed text chunks and store the vectors in a FAISS L2 index.

    Args:
        chunks: Non-empty sequence of strings to embed.

    Returns:
        A ``faiss.IndexFlatL2`` containing one vector per chunk, added in the
        same order as ``chunks``.

    Raises:
        ValueError: If ``chunks`` is empty.
    """
    # With nothing to embed there is no embedding matrix to read the vector
    # dimension from; fail early with a clear message instead of an opaque
    # error at `embeddings.shape[1]`.
    if len(chunks) == 0:
        raise ValueError("chunks must contain at least one text to index")

    # FAISS operates on C-contiguous float32 matrices; make that explicit
    # rather than relying on the encoder's default output type.
    embeddings = np.ascontiguousarray(embedder.encode(chunks), dtype=np.float32)
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index

# Build the corpus and its search index as a side effect of importing this
# module: read every PDF in the "notes" folder (a relative path, so it is
# resolved against the current working directory), then embed and index the
# resulting chunks. `chunks[i]` and `sources[i]` describe the same chunk.
chunks, sources = load_all_pdfs("notes")
index = create_vector_store(chunks)