This Apple Machine Learning Engineer phone screen asks you to build a tiny similarity search engine using the bag-of-words (BoW) representation, from scratch, without scikit-learn or any other machine learning libraries.
Given a set of documents, implement a bag-of-words similarity search engine that can find the most similar document to a given query document. Here are the steps you need to follow:
Let's say we have the following documents:
docs = [ "The quick brown fox jumps over the lazy dog", "Never jump over the lazy dog quickly", "The dog is lazy, but the fox is quick" ]
And the query document:
query = "Is the fox quick or lazy?"
The expected output would be the index of the most similar document to the query.
Here's a high-level overview of the solution:
Tokenization and Preprocessing: lowercase each document, strip punctuation, remove common stop words, and stem the remaining tokens.
Create Vocabulary: collect every distinct preprocessed token across the corpus and assign each one a fixed integer index.
Vectorization: convert each document (and the query) into a fixed-length numeric vector over that shared vocabulary.
Cosine Similarity: score the query vector against every document vector and return the index of the highest-scoring document.
import numpy as np
from collections import Counter
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string
# Initialize stop words and stemmer
# NOTE(review): requires the NLTK 'stopwords' corpus to be downloaded
# (nltk.download('stopwords')) before import-time execution — otherwise
# stopwords.words() raises a LookupError here.
stop_words = set(stopwords.words('english'))  # set for O(1) membership tests in preprocess()
stemmer = PorterStemmer()  # shared stemmer instance reused by preprocess()
# Preprocess and vectorize documents
def preprocess(doc):
    """Normalize *doc* into a list of stemmed tokens.

    Lowercases the text, strips all punctuation, drops English stop
    words, then stems each surviving token with the shared Porter
    stemmer. Stop-word filtering happens before stemming, so the check
    is against the raw lowercase token.
    """
    cleaned = doc.lower().translate(str.maketrans('', '', string.punctuation))
    tokens = []
    for token in cleaned.split():
        if token in stop_words:
            continue
        tokens.append(stemmer.stem(token))
    return tokens
def create_vocabulary(docs):
    """Build a word -> index mapping over every token in *docs*.

    Each distinct preprocessed token in the corpus receives a unique
    integer position; that position later determines where the token's
    value lands in every document vector.
    """
    distinct = {token for doc in docs for token in preprocess(doc)}
    return dict(zip(distinct, range(len(distinct))))
def vectorize(doc, vocabulary):
    """Return the bag-of-words (term-frequency) vector for *doc*.

    Fixes two issues in the original:
    - it produced a binary presence/absence vector, discarding term
      frequencies — not a true bag-of-words (and leaving the imported
      Counter unused);
    - `word in words` was an O(n) list-membership test run once per
      vocabulary entry.

    Args:
        doc: raw document string; tokenized via preprocess().
        vocabulary: word -> index mapping from create_vocabulary().

    Returns:
        A list of ints of length len(vocabulary); position i holds the
        count of the word whose vocabulary index is i (0 for
        out-of-vocabulary or absent words).
    """
    counts = Counter(preprocess(doc))
    vector = [0] * len(vocabulary)
    for word, index in vocabulary.items():
        vector[index] = counts[word]  # Counter returns 0 for missing keys
    return vector
# Compute cosine similarity
def cosine_similarity(vec1, vec2):
    """Return the cosine similarity between two numeric vectors.

    Fixes a division-by-zero: if either vector has zero magnitude
    (e.g. a query sharing no vocabulary with the corpus vectorizes to
    all zeros), the original produced a ZeroDivisionError or NaN.
    Such a pair has no angle; 0.0 (no similarity) is returned instead.

    Args:
        vec1, vec2: equal-length sequences of numbers.

    Returns:
        float in [-1.0, 1.0]; 0.0 when either vector is all zeros.
    """
    v1 = np.asarray(vec1, dtype=float)
    v2 = np.asarray(vec2, dtype=float)
    denom = np.linalg.norm(v1) * np.linalg.norm(v2)
    if denom == 0.0:
        return 0.0
    return float(np.dot(v1, v2) / denom)
# Main function
def bag_of_words_similarity_search(docs, query):
    """Find the document in *docs* most similar to *query*.

    Builds a vocabulary over the corpus, vectorizes the query and each
    document against it, and scores every pair with cosine similarity.

    Returns:
        The index into *docs* of the best match (first one wins on ties,
        matching list.index semantics).
    """
    vocab = create_vocabulary(docs)
    query_vec = vectorize(query, vocab)
    scores = [cosine_similarity(query_vec, vectorize(doc, vocab)) for doc in docs]
    return max(range(len(scores)), key=scores.__getitem__)
# Example usage
# Small toy corpus; doc 2 shares the most content words ("fox", "quick",
# "lazy", "dog") with the query and is the expected best match.
docs = [
"The quick brown fox jumps over the lazy dog",
"Never jump over the lazy dog quickly",
"The dog is lazy, but the fox is quick"
]
query = "Is the fox quick or lazy?"
# index is the position in `docs` of the most similar document
index = bag_of_words_similarity_search(docs, query)
print(f"Most