from concurrent.futures import ThreadPoolExecutor

def matrix_multiply_parallel(A, B, num_threads=1):
    """Return the matrix product ``A @ B`` computed row-wise on a thread pool.

    The rows of ``A`` are split into contiguous chunks, one per worker
    thread, and each worker computes the matching rows of the product.

    Args:
        A: Left operand, a sequence of ``n`` rows each of length ``m``.
        B: Right operand, a sequence of ``m`` rows each of length ``p``.
        num_threads: Upper bound on the number of worker threads. Fewer
            threads are used when ``A`` has fewer rows than this.

    Returns:
        A new ``n`` x ``p`` list of lists holding the product. An empty
        ``A`` yields ``[]``.

    Raises:
        ValueError: If ``num_threads`` is less than 1, if a row of ``A``
            does not have ``len(B)`` entries, or if the rows of ``B``
            differ in length.

    Note:
        On a GIL-enabled CPython build, threads do not run pure-Python
        arithmetic in parallel, so this is unlikely to be faster than a
        single-threaded loop.
    """
    if num_threads < 1:
        raise ValueError("num_threads must be at least 1")

    n = len(A)
    if n == 0:
        return []

    inner = len(B)
    if any(len(row) != inner for row in A):
        raise ValueError(
            "each row of A must have as many entries as B has rows"
        )
    if any(len(row) != len(B[0]) for row in B):
        raise ValueError("all rows of B must have the same length")

    # Transpose B once so each dot product walks two flat sequences.
    columns = list(zip(*B))

    def worker(start, end):
        # Compute product rows [start, end) and hand them back, so no
        # thread writes into a structure shared with the others.
        return [
            [sum(a * b for a, b in zip(A[i], col)) for col in columns]
            for i in range(start, end)
        ]

    # Never start more workers than there are rows, and spread the
    # remainder over the first `extra` chunks so no row is left out.
    workers = min(num_threads, n)
    base, extra = divmod(n, workers)
    bounds = []
    start = 0
    for w in range(workers):
        end = start + base + (1 if w < extra else 0)
        bounds.append((start, end))
        start = end

    result = []
    with ThreadPoolExecutor(max_workers=workers) as executor:
        futures = [executor.submit(worker, lo, hi) for lo, hi in bounds]
        # Collect in submission order to keep rows in order; result()
        # re-raises any exception a worker hit.
        for future in futures:
            result.extend(future.result())

    return result