Level: Senior-Level
Round: Onsite · Type: Coding · Difficulty: 7/10 · Duration: 60 min · Interviewer: Neutral
Topics: Hashing, File System
Location: San Francisco Bay Area
Interview date: 2025-12-27
Question: Design an algorithm to find duplicate files with identical binary content in a simulated file system, given file system APIs for listing files, checking directory status, getting file size, and opening a binary stream.
You are tasked with finding groups of duplicate files within a simulated file system rooted at "/". Two files are considered duplicates if their complete binary contents are exactly the same, regardless of file names or paths. The challenge lies in efficiently identifying these duplicates, given that file content may be large and is accessible only as a binary stream.
You need to implement the DuplicateFileFinder class with the following methods:
DuplicateFileFinder(FileSystem fs): Initializes the finder with the given file system. List<List<String>> findDuplicateFiles(): Traverses the file system from the root and returns a list of groups, where each group contains two or more file paths with identical content. The order of groups, and of file paths within a group, is not important. You are provided with the following FileSystem class and its methods:
class FileSystem {
    /* List all files in the directory */
    List<String> listFiles(String path) {...}

    /* Check if the path is a directory */
    boolean isDirectory(String path) {...}

    /* Get the size of the file in bytes */
    int getFileSize(String path) {...}

    /* Open the file and return a binary stream */
    InputStream openStream(String path) {...}
}
Input:
["DuplicateFileFinder", "findDuplicateFiles"]
[[[["/a/x.txt", "hello world"], ["/c/y.txt", "hello world"], ["/b/z.txt", "unique content"]]], []]
Output:
[null, [["/a/x.txt", "/c/y.txt"]]]
Explanation:
| File Path    | Content          |
|--------------|------------------|
| "/a/x.txt"   | "hello world"    |
| "/c/y.txt"   | "hello world"    |
| "/b/z.txt"   | "unique content" |
DuplicateFileFinder finder = new DuplicateFileFinder(fs); // Initialize with a file system containing the specified files. finder.findDuplicateFiles(); // Returns [["/a/x.txt", "/c/y.txt"]].
import hashlib
from io import BytesIO
from typing import List, Dict, Optional
class FSNode:
    """A node in the simulated file system: either a directory or a file."""

    def __init__(self, isDir=None, content=None):
        """Create a node.

        Args:
            isDir: True for a directory; ignored when `content` is given.
            content: raw bytes of a file; its presence marks this node a file.
        """
        if content is not None:
            # A node carrying content is always a file.
            self.isDir = False
            self.content = content
        else:
            # With neither flag nor content, default to a directory.
            self.isDir = isDir if isDir is not None else True
            if self.isDir:
                # Child name -> FSNode; only directories have children.
                self.children = {}
class FileSystem:
    """In-memory simulated file system rooted at '/'.

    Paths are absolute, '/'-separated strings; files hold raw bytes.
    """

    def __init__(self):
        # The root is always a directory.
        self.root = FSNode(True)

    def addFile(self, path: str, content: str):
        """Add a file whose content is the UTF-8 encoding of `content`."""
        self.addFileBytes(path, content.encode())

    def addFileBytes(self, path: str, content: bytes):
        """Add a file with raw byte content, creating parent directories as needed."""
        parts = self.splitPath(path)
        curr = self.root
        for part in parts[:-1]:
            # Create intermediate directories on demand.
            if part not in curr.children:
                curr.children[part] = FSNode(True)
            curr = curr.children[part]
        curr.children[parts[-1]] = FSNode(content=content)

    def listFiles(self, path: str) -> List[str]:
        """Return full paths of entries directly under directory `path`.

        Returns an empty list when `path` is missing or not a directory.
        """
        node = self.getNode(path)
        if node is None or not node.isDir:
            return []
        prefix = path if path.endswith("/") else path + "/"
        return [prefix + name for name in node.children]

    def isDirectory(self, path: str) -> bool:
        """True iff `path` exists and is a directory."""
        node = self.getNode(path)
        return node is not None and node.isDir

    def getFileSize(self, path: str) -> int:
        """Size in bytes of the file at `path`; 0 for missing paths or directories."""
        node = self.getNode(path)
        if node is None or node.isDir:
            return 0
        return len(node.content)

    def openStream(self, path: str):
        """Open a binary stream over the file's content.

        Missing paths and directories yield an empty stream rather than an error.
        """
        node = self.getNode(path)
        if node is None or node.isDir:
            return BytesIO(b'')
        return BytesIO(node.content)

    def getNode(self, path: str) -> Optional["FSNode"]:
        """Resolve `path` to its node, or None if any component is missing."""
        if path == "/":
            return self.root
        curr = self.root
        for part in self.splitPath(path):
            if curr is None or not curr.isDir or part not in curr.children:
                return None
            curr = curr.children[part]
        return curr

    def splitPath(self, path: str) -> List[str]:
        """Split an absolute path into components; the root '/' yields []."""
        cleaned = path
        # Trim at most one leading and one trailing slash.
        if cleaned.startswith("/"):
            cleaned = cleaned[1:]
        if cleaned.endswith("/"):
            cleaned = cleaned[:-1]
        return cleaned.split("/") if cleaned else []
class DuplicateFileFinder:
    """Finds groups of files with identical binary content under '/'.

    Strategy: collect every file path via DFS, bucket paths by file size
    (a cheap pre-filter — only same-size files can be byte-identical),
    then hash only the files sharing a size and group them by digest.
    """

    def __init__(self, fs: "FileSystem"):
        self.fs = fs

    def findDuplicateFiles(self) -> List[List[str]]:
        """Return groups (each with 2+ paths) of files with identical content."""
        allFiles: List[str] = []
        self.collectFiles("/", allFiles)

        # Bucket by size so hashing is skipped for uniquely-sized files.
        sizeMap: Dict[int, List[str]] = {}
        for filePath in allFiles:
            sizeMap.setdefault(self.fs.getFileSize(filePath), []).append(filePath)

        result: List[List[str]] = []
        for sizeGroup in sizeMap.values():
            if len(sizeGroup) < 2:
                continue
            # Within a size bucket, group by content digest.
            hashMap: Dict[str, List[str]] = {}
            for filePath in sizeGroup:
                hashMap.setdefault(self.computeHash(filePath), []).append(filePath)
            for group in hashMap.values():
                if len(group) >= 2:
                    group.sort()  # deterministic ordering within a group
                    result.append(group)
        return result

    def collectFiles(self, path: str, files: List[str]):
        """DFS from `path`, appending every file path found to `files`."""
        if not self.fs.isDirectory(path):
            files.append(path)
            return
        for child in self.fs.listFiles(path):
            self.collectFiles(child, files)

    def computeHash(self, filePath: str) -> str:
        """SHA-256 hex digest of the file's content, read in 4 KiB chunks.

        Raises RuntimeError (chained to the original error) on any failure.
        """
        try:
            md = hashlib.sha256()
            stream = self.fs.openStream(filePath)
            try:
                buffer = bytearray(4096)
                while True:
                    bytesRead = stream.readinto(buffer)
                    if not bytesRead:  # None or 0 means end of stream
                        break
                    md.update(buffer[:bytesRead])
            finally:
                # Close the stream even if reading raised.
                stream.close()
            return md.hexdigest()
        except Exception as e:
            raise RuntimeError(e) from e
class Solution:
    """Small driver that exercises DuplicateFileFinder against sample file systems."""

    @staticmethod
    def main():
        Solution.test1()
        Solution.test2()
        Solution.test3()

    @staticmethod
    def test1():
        """Two files share content; expect a single two-file duplicate group."""
        print("===== Test 1 =====")
        fs = FileSystem()
        fs.addFile("/a/x.txt", "hello world")
        fs.addFile("/c/y.txt", "hello world")
        fs.addFile("/b/z.txt", "unique content")
        finder = DuplicateFileFinder(fs)
        print(finder.findDuplicateFiles())
        # Expected: [["/a/x.txt", "/c/y.txt"]]

    @staticmethod
    def test2():
        """Mixed duplicates: a three-file group, a two-file group, and uniques."""
        print("===== Test 2 =====")
        fs = FileSystem()
        fs.addFile("/dir1/a.txt", "content A")
        fs.addFile("/dir2/b.txt", "content A")
        fs.addFile("/dir3/c.txt", "content A")
        largeContent = "This is a longer piece of content for testing hash computation with more data."
        fs.addFile("/docs/report.txt", largeContent)
        fs.addFile("/backup/report_copy.txt", largeContent)
        fs.addFile("/unique/file1.txt", "unique 1")
        fs.addFile("/unique/file2.txt", "unique 2")
        finder = DuplicateFileFinder(fs)
        print(finder.findDuplicateFiles())
        # Expected: [["/backup/report_copy.txt", "/docs/report.txt"],
        #            ["/dir1/a.txt", "/dir2/b.txt", "/dir3/c.txt"]]

    @staticmethod
    def test3():
        """Empty files duplicate each other; same-size files with different bytes do not."""
        print("===== Test 3 =====")
        fs = FileSystem()
        fs.addFile("/empty/a.txt", "")
        fs.addFile("/empty/b.txt", "")
        fs.addFile("/empty/c.txt", "")
        fs.addFile("/same_size/x.txt", "abc")
        fs.addFile("/same_size/y.txt", "xyz")
        fs.addFile("/solo/only.txt", "I am alone")
        finder = DuplicateFileFinder(fs)
        print(finder.findDuplicateFiles())
        # Expected: [["/empty/a.txt", "/empty/b.txt", "/empty/c.txt"]]


if __name__ == "__main__":
    Solution.main()
LeetCode similar: LeetCode 609