Faster and reproducible ML results

Any machine learning researcher have to find an effective work-flow for fast experimentation and for reproducible results. This starts by fixing the random seeds

import os
import random
import numpy as np
import torch
def fix_randseeds(seed=1234):
    try:
        os.environ['PYTHONHASHSEED'] = str(seed)
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = True
    except Exception as e:
        print(e)
        pass

Also, in many cases one needs to disable numpy threads and use parallel tools provided by the machine learning framework (e.g. pytorch) or use a python multiprocessing library. This could also speeds up standard machine learning libraries (sklearn, lgbm, etc.)

import os
def force_single_thread():
    os.environ["OMP_NUM_THREADS"] = "1"
    os.environ["OPENBLAS_NUM_THREADS"] = "1"
    os.environ["MKL_NUM_THREADS"] = "1"
    os.environ["VECLIB_MAXIMUM_THREADS"] = "1"
    os.environ["NUMEXPR_NUM_THREADS"] = "1"