I am new to CUDA and Numba. My first goal was to write a hash cracker for my GPU, an RTX 2070S. That didn't work for me at first, so I wrote it for my CPU, an Intel i7-10700KF. Later I read some NumPy documentation and rewrote the code for my GPU. Now, when I let the two scripts compete, the CPU code generates the password lists much faster, probably because of the conversion from hash to array. Can someone who is more advanced help me optimize the code for the GPU?
# cpu.py
import gc
import hashlib
import itertools
import time
from concurrent.futures import ThreadPoolExecutor
import numpy as np
from numba import njit
@njit
def cpu_kernel(hashed: np.ndarray, un_hashed: np.ndarray, md5: str):
    """Return the plain password whose hash equals ``md5``, or ``None``.

    ``hashed`` and ``un_hashed`` are read with the same index, so entry
    ``k`` of one is taken to belong to entry ``k`` of the other.  The first
    matching entry wins.
    """
    for idx in range(hashed.size):
        if hashed[idx] != md5:
            continue
        return un_hashed[idx]
    return None
def get_password_possibilities(signs_length, min_length, max_length):
    """Count the candidate passwords for an alphabet and a range of lengths.

    Args:
        signs_length: Number of distinct characters in the alphabet.
        min_length: Shortest password length to count (inclusive).
        max_length: Longest password length to count (inclusive).

    Returns:
        The sum of ``signs_length ** n`` for every length ``n`` in the
        range; ``0`` when the range is empty.  Negative lengths are ignored
        because no password can have them.
    """
    shortest = max(min_length, 0)
    return sum(signs_length ** n for n in range(shortest, max_length + 1))
def generate_password_list(signs, min_len, max_len, letters=None):
    """Build one batch of fixed-length candidate passwords and their MD5s.

    Candidates are enumerated in ``itertools.product`` order and numbered
    from 0; the batch holds the candidates numbered ``min_len`` through
    ``max_len``, both bounds included.

    Args:
        signs: Length of every generated password.
        min_len: Index of the first candidate to include.
        max_len: Index of the last candidate to include.
        letters: Alphabet to draw characters from; defaults to the
            module-level ``LETTERS``.

    Returns:
        A ``(passwords, hashes)`` tuple of equally long lists, where
        ``hashes[k]`` is the MD5 hex digest of ``passwords[k]``.
    """
    alphabet = LETTERS if letters is None else letters
    # islice skips the candidates before the batch inside itertools instead
    # of joining and counting each of them in a Python loop, and a single
    # pass replaces the two separate enumerations of the product.
    first = max(min_len, 0)
    stop = max(max_len + 1, 0)
    batch = itertools.islice(
        itertools.product(alphabet, repeat=signs), first, stop
    )
    normal_pwds = [''.join(chars) for chars in batch]
    hashed_pwds = [
        hashlib.md5(pwd.encode("utf-8")).hexdigest() for pwd in normal_pwds
    ]
    return normal_pwds, hashed_pwds
def try_crack(passwords, hashs, search):
    """Search one batch for ``search`` and stop the program on a hit.

    Args:
        passwords: Plain candidate passwords of the batch.
        hashs: Hashes of the candidates, in the same order as ``passwords``.
        search: The hash that is being looked for.

    Raises:
        SystemExit: After printing the result, when a candidate's hash
            equals ``search``.
    """
    passwords_hashed = np.array(hashs)
    passwords = np.array(passwords)
    result = cpu_kernel(passwords_hashed, passwords, search)
    # Drop the large batch arrays before the next batch gets generated.
    del passwords_hashed, passwords
    gc.collect()
    if result is None:
        return
    # Report the hash that was actually searched for (the ``search``
    # argument) rather than a module-level global.
    print(f"Successfully cracked {search}, {result} in {time.time() - find_timer} seconds")
    # exit() is an interactive helper installed by the site module;
    # raising SystemExit ends the program the same way without it.
    raise SystemExit
# Alphabet the candidate passwords are built from.
LETTERS = "abcdefghijklmnopqrstuvwxyz"
# Demo target: the MD5 hex digest of a known six-letter password.
hash_to_crack = hashlib.md5("sabine".encode("utf-8")).hexdigest()
print(hash_to_crack)
current_length = 1
# Number of candidates requested from generate_password_list per batch.
pwd_amount = 10750000
find_timer = time.time()
# Try ever longer passwords; try_crack() ends the program once it finds
# the target, so this loop has no exit condition of its own.
while True:
    pwd_idx = 0
    possibilities = get_password_possibilities(len(LETTERS), current_length, current_length)
    continue_to_next = False
    print(f"Checking {current_length} signs, with {round(possibilities / 1000000, 2)} mio possible passwords")
    sign_start = time.time()
    while not continue_to_next:
        start = time.time()
        # Generate the batch directly: handing a single task to a thread
        # pool and polling it with sleep() adds latency but no parallelism.
        passwords, hashes = generate_password_list(
            current_length, pwd_idx * pwd_amount, (pwd_idx + 1) * pwd_amount
        )
        length = len(passwords)
        print(f"Generated {length} passwords in {time.time() - start} seconds")
        try_crack(passwords, hashes, hash_to_crack)
        # Free the batch before the next one is built.
        del passwords, hashes
        gc.collect()
        if length < pwd_amount:
            # A short batch means this password length is exhausted.
            continue_to_next = True
        else:
            pwd_idx += 1
            print(f"Running batch {pwd_idx + 1}")
    print(f"Checking {current_length} signs took {time.time() - sign_start} seconds\n")
    current_length += 1
# gpu.py
import numpy as np
from numba import cuda
import hashlib
import itertools
import time
import gc
from concurrent.futures import ThreadPoolExecutor
@cuda.jit
def gpu_kernel(target: np.ndarray, hashes: np.ndarray, res: np.ndarray):
    """Compare one row of ``hashes`` per thread against ``target``.

    Args:
        target: The searched hash as a 1-D array.
        hashes: 2-D array with one candidate hash per row.
        res: Output array; ``res[0]`` is set to the row index of a
            candidate whose every element equals ``target``.
    """
    pos = cuda.grid(1)
    # The launch grid is rounded up to whole blocks, so surplus threads
    # must not index past the last row of the batch.
    if pos >= hashes.shape[0]:
        return
    last = target.shape[0] - 1
    for i in range(target.shape[0]):
        if target[i] != hashes[pos, i]:
            return
        if i == last:
            res[0] = pos
def generate_password_list(signs, min_len, max_len, letters=None):
    """Build one batch of fixed-length candidates and their MD5s as arrays.

    Candidates are enumerated in ``itertools.product`` order and numbered
    from 0; the batch holds the candidates numbered ``min_len`` through
    ``max_len``, both bounds included.

    Args:
        signs: Length of every generated password.
        min_len: Index of the first candidate to include.
        max_len: Index of the last candidate to include.
        letters: Alphabet to draw characters from; defaults to the
            module-level ``LETTERS``.

    Returns:
        A ``(passwords, hashes)`` tuple of equally long lists of ``uint8``
        arrays: the UTF-8 bytes of each password and the ASCII bytes of
        its MD5 hex digest.  The arrays are read-only views on the
        underlying bytes objects.
    """
    alphabet = LETTERS if letters is None else letters
    # islice skips the candidates before the batch inside itertools instead
    # of joining and counting each of them in a Python loop, and a single
    # pass replaces the two separate enumerations of the product.
    first = max(min_len, 0)
    stop = max(max_len + 1, 0)
    batch = itertools.islice(
        itertools.product(alphabet, repeat=signs), first, stop
    )
    normal_passwords, hashed_passwords = [], []
    for chars in batch:
        raw = ''.join(chars).encode("utf-8")
        digest = hashlib.md5(raw).hexdigest().encode("ascii")
        # np.frombuffer replaces the deprecated binary mode of np.fromstring.
        normal_passwords.append(np.frombuffer(raw, dtype=np.uint8))
        hashed_passwords.append(np.frombuffer(digest, dtype=np.uint8))
    return normal_passwords, hashed_passwords
def get_password_possibilities(signs_length, min_length, max_length):
    """Count the candidate passwords for an alphabet and a range of lengths.

    Args:
        signs_length: Number of distinct characters in the alphabet.
        min_length: Shortest password length to count (inclusive).
        max_length: Longest password length to count (inclusive).

    Returns:
        The sum of ``signs_length ** n`` for every length ``n`` in the
        range; ``0`` when the range is empty.  Negative lengths are ignored
        because no password can have them.
    """
    shortest = max(min_length, 0)
    return sum(signs_length ** n for n in range(shortest, max_length + 1))
def main():
    """Brute-force a demo MD5 hash, comparing candidate hashes on the GPU.

    Candidates are generated on the host in batches of increasing password
    length; each batch of hashes is compared against the target by
    ``gpu_kernel``.

    Raises:
        SystemExit: After printing the result, once a candidate matches.
    """
    hash_to_crack = hashlib.md5("sabine".encode("utf-8")).hexdigest()
    # np.frombuffer replaces the deprecated binary mode of np.fromstring;
    # copy() yields a writable array that owns its data.
    hash_arr = np.frombuffer(hash_to_crack.encode("ascii"), dtype=np.uint8).copy()
    # The target never changes, so upload it to the device only once.
    d_target = cuda.to_device(hash_arr)
    threads_per_block = 32
    current_length = 1
    # Number of candidates requested from generate_password_list per batch.
    pwd_amount = 10000000
    find_timer = time.time()
    while True:
        pwd_idx = 0
        possibilities = get_password_possibilities(len(LETTERS), current_length, current_length)
        continue_to_next = False
        print(f"Checking {current_length} signs, with {round(possibilities / 1000000, 2)} mio possible passwords")
        sign_start = time.time()
        while not continue_to_next:
            start = time.time()
            # Generate the batch directly: handing a single task to a
            # thread pool and polling it with sleep() adds latency but no
            # parallelism.
            passwords, hashes = generate_password_list(
                current_length, pwd_idx * pwd_amount, (pwd_idx + 1) * pwd_amount
            )
            length = len(passwords)
            print(f"Generated {length} passwords in {time.time() - start} seconds")
            passwords_array = np.array(passwords)
            hashes_array = np.array(hashes)
            # Free the Python lists before the device copies are made.
            del passwords, hashes
            gc.collect()
            print("Started cracking")
            # Explicit device arrays avoid the implicit copy of every
            # argument back to the host after the kernel has run; only the
            # one-element result is fetched.
            d_hashes = cuda.to_device(hashes_array)
            d_result = cuda.to_device(np.array([-1]))
            blocks_per_grid = (length // threads_per_block) + 1
            gpu_kernel[blocks_per_grid, threads_per_block](d_target, d_hashes, d_result)
            found = int(d_result.copy_to_host()[0])
            if found != -1:
                print("Hash is", passwords_array[found].tobytes())
                print(f"Cracking took {time.time() - find_timer}")
                # exit() is an interactive helper installed by the site
                # module; raising SystemExit ends the program without it.
                raise SystemExit
            if length < pwd_amount:
                # A short batch means this password length is exhausted.
                continue_to_next = True
            else:
                pwd_idx += 1
                print(f"Running batch {pwd_idx + 1}")
        print(f"Checking {current_length} signs took {time.time() - sign_start} seconds\n")
        current_length += 1
# Alphabet the candidate passwords are built from.  Defined at module level
# so generate_password_list() also works when this file is imported rather
# than run as a script.
LETTERS = "abcdefghijklmnopqrstuvwxyz"

if __name__ == '__main__':
    main()
I think the main problems with this code are that it generates the whole list even if it only reads the last 10 million passwords, and that it runs on only one or two threads.
question from:
https://stackoverflow.com/questions/66062347/numba-njit-code-runs-faster-than-numba-cuda-jit