Plot not showing, modified rdfpy source code after Runtime error

Hello, I am trying to compute the radial distribution function for a system of atoms I have in an xyz file. I tried using rdfpy (Introduction and Examples — rdfpy 1.0.0 documentation), only to get a runtime error.

It initially looked like this:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from rdfpy import rdf
from pymatgen.core.structure import Structure,Molecule,Lattice

system = Molecule.from_file(r'C:\Users\bnzmi\OneDrive - Northeastern University\Lustig Lab\RDF Attempts\hybrid7-pos-1.xyz')

a = 11
b = 11
c = 11
structure = Structure(
    Lattice([[a, 0., 0.], [0., b, 0.], [0., 0., c]]),
    species=system.species,
    coords=system.cart_coords,
    coords_are_cartesian=True   
)
structure.make_supercell(1)
coords = structure.cart_coords
noise = np.random.normal(loc=0.0, scale=0.05, size=(coords.shape))
coords = coords + noise
g_r, radii = rdf(coords, dr=0.05)

However, I only got this Runtime error:

An attempt has been made to start a new process before the
current process has finished its bootstrapping phase.

After following advice on Github, I tried adding this to my code:

import sys
import pandas as pd
import numpy as np
import matplotlib as plt
from rdfpy import rdf
from pymatgen.core.structure import Molecule

def main():
    coords = structure.cart_coords
    print(coords)
    noise = np.random.normal(loc=0.0, scale=0.05, size=(coords.shape))
    coords = coords + noise

    g_r, radii = rdf(coords, dr=0.05)

if __name__ == "__main__":
    sys.exit(main())

I tried using it with rdfpy, but I was not quite sure how to plot g_r vs radii. Instead, I sought to change the rdfpy source data: rdfpy/rdfpy/rdfpy.py at master · by256/rdfpy · GitHub.

Eventually, this is my code, and it is giving me a Recursion error (maximum recursion depth exceeded).


import sys
import os
import time
import numpy as np
from multiprocessing import Pool
from scipy.spatial import cKDTree
from pymatgen.core.structure import Structure,Molecule,Lattice

def paralell_hist_loop(radii_and_indices, kdtree, particles, mins, maxs, N_radii, dr, eps, rho):
    """RDF histogram loop process for multiprocessing"""
    N, d = particles.shape
    g_r_partial = np.zeros(shape=(N_radii))
    for r_idx, r in radii_and_indices:
        r_idx = int(r_idx)
        # find all particles that are at least r + dr away from the edges of the box
        valid_idxs = np.bitwise_and.reduce([(particles[:, i]-(r+dr) >= mins[i]) & (particles[:, i]+(r+dr) <= maxs[i]) for i in range(d)])
        valid_particles = particles[valid_idxs]
        
        # compute n_i(r) for valid particles.
        for particle in valid_particles:
            n = kdtree.query_ball_point(particle, r+dr-eps, return_length=True) - kdtree.query_ball_point(particle, r, return_length=True)
            g_r_partial[r_idx] += n
        
        # normalize
        n_valid = len(valid_particles)
        shell_vol = (4/3)*np.pi*((r+dr)**3 - r**3) if d == 3 else np.pi*((r+dr)**2 - r**2)
        g_r_partial[r_idx] /= n_valid*shell_vol*rho
    
    return g_r_partial
    
def main():
    system = Molecule.from_file(r'C:\Users\bnzmi\OneDrive - Northeastern University\Lustig Lab\RDF Attempts\hybrid7-pos-1.xyz')
    a = 11
    b = 11
    c = 11

    structure = Structure(
        Lattice([[a, 0., 0.], [0., b, 0.], [0., 0., c]]),
        species=system.species,
        coords=system.cart_coords,
        coords_are_cartesian=True   
        )

    structure.make_supercell(1)
    coords = structure.cart_coords
    noise = np.random.normal(loc=0.0, scale=0.05, size=(coords.shape))
    coords = coords + noise

    g_r, radii = rdf(coords, dr=0.05)
    
    return g_r, radii


def rdf(particles, dr, rho=None, rcutoff=0.9, eps=1e-15, parallel=True, progress=False):
    """
    Computes 2D or 3D radial distribution function g(r) of a set of particle 
    coordinates of shape (N, d). Particle must be placed in a 2D or 3D cuboidal 
    box of dimensions [width x height (x depth)].
    
    Parameters
    ----------
    particles : (N, d) np.array
        Set of particle from which to compute the radial distribution function 
        g(r). Must be of shape (N, 2) or (N, 3) for 2D and 3D coordinates 
        repsectively.
    dr : float
        Delta r. Determines the spacing between successive radii over which g(r)
        is computed.
    rho : float, optional
        Number density. If left as None, box dimensions will be inferred from 
        the particles and the number density will be calculated accordingly.
    rcutoff : float
        radii cutoff value between 0 and 1. The default value of 0.9 means the 
        independent variable (radius) over which the RDF is computed will range 
        from 0 to 0.9*r_max. This removes the noise that occurs at r values 
        close to r_max, due to fewer valid particles available to compute the 
        RDF from at these r values.
    eps : float, optional
        Epsilon value used to find particles less than or equal to a distance 
        in KDTree.
    parallel : bool, optional
        Option to enable or disable multiprocessing. Enabling this affords 
        significant increases in speed.
    progress : bool, optional
        Set to False to disable progress readout (only valid when 
        parallel=False).
        
    
    Returns
    -------
    g_r : (n_radii) np.array
        radial distribution function values g(r).
    radii : (n_radii) np.array
        radii over which g(r) is computed
    """

    if not isinstance(particles, np.ndarray):
        particles = np.array(particles)
    # assert particles array is correct shape
    shape_err_msg = 'particles should be an array of shape N x d, where N is \
                     the number of particles and d is the number of dimensions.'
    assert len(particles.shape) == 2, shape_err_msg
    # assert particle coords are 2 or 3 dimensional
    assert particles.shape[-1] in [2, 3], 'RDF can only be computed in 2 or 3 \
                                           dimensions.'
    
    start = time.time()

    mins = np.min(particles, axis=0)
    maxs = np.max(particles, axis=0)
    # translate particles such that the particle with min coords is at origin
    particles = particles - mins

    # dimensions of box
    dims = maxs - mins
    
    r_max = (np.min(dims) / 2)*rcutoff
    radii = np.arange(dr, r_max, dr)

    N, d = particles.shape
    if not rho:
        rho = N / np.prod(dims) # number density
    
    # create a KDTree for fast nearest-neighbor lookup of particles
    tree = cKDTree(particles)
    
    if __name__ == "__main__":
        sys.exit(main())

    if parallel:
        N_radii = len(radii)
        radii_and_indices = np.stack([np.arange(N_radii), radii], axis=1)
        radii_splits = np.array_split(radii_and_indices, os.cpu_count(), axis=0)
        values = [(radii_splits[i], tree, particles, mins, maxs, N_radii, dr, eps, rho) for i in range(len(radii_splits))]
        with Pool() as pool:
            results = pool.starmap(paralell_hist_loop, values)
        g_r = np.sum(results, axis=0)
    else:
        g_r = np.zeros(shape=(len(radii)))
        for r_idx, r in enumerate(radii):
            # find all particles that are at least r + dr away from the edges of the box
            valid_idxs = np.bitwise_and.reduce([(particles[:, i]-(r+dr) >= mins[i]) & (particles[:, i]+(r+dr) <= maxs[i]) for i in range(d)])
            valid_particles = particles[valid_idxs]
            
            # compute n_i(r) for valid particles.
            for particle in valid_particles:
                n = tree.query_ball_point(particle, r+dr-eps, return_length=True) - tree.query_ball_point(particle, r, return_length=True)
                g_r[r_idx] += n
            
            # normalize
            n_valid = len(valid_particles)
            shell_vol = (4/3)*np.pi*((r+dr)**3 - r**3) if d == 3 else np.pi*((r+dr)**2 - r**2)
            g_r[r_idx] /= n_valid*shell_vol*rho

            if progress:
                print('Computing RDF     Radius {}/{}    Time elapsed: {:.3f} s'.format(r_idx+1, len(radii), time.time()-start), end='\r', flush=True)

    return g_r, radii
    
g_r, radii = main()

plt.plot(radii, g_r)
plt.xlabel('Radius')
plt.ylabel('Radial Distribution Function, g(r)')
plt.title('Radial Distribution Function')
plt.show()

Does anyone have a suggestion? Thank you kindly for your time!

When using multiprocessing, what you need to remember is that when you ask it to run a function in a subprocess, it’ll create a subprocess to import the script and call the function.

That means that the script should not do any work that belongs in the main (parent) process when it’s merely imported.

Your last script, as written, will run main() when run, which will start a subprocess, which will run main() when imported, which will start a subprocess, which will run main() when imported, and so on.

Eventually, it’ll raise a recursion error.

Hi @MRAB, thank you for your explanation! I am a little confused though, to be honest - is there any way you could show what my code should be like? I am not familiar with multiprocessing at all, and seeing an example of what it should be will help me understand your explanation better.
Thank you very much for your help!

The normal structure of a script that uses multiprocessing is something like:

...imports...
...functions...

if __name__ == "__main__":
    ...things to do when run...

or:

...imports...
...functions...

def main():
   ...

if __name__ == "__main__":
    main()

You do have the if __name__ == "__main__": part, but it’s inside a function, and you have lines at the end that are always run, even when the script is imported, e.g., g_r, radii = main(). That’s the problem.

Hello Matthew @MRAB, thank you for your help and patience.
This is how I changed my script by moving if __name__ == "__main__", but I am still getting the same error.

import numpy as np
import matplotlib.pyplot as plt

import sys

import os
import time
import numpy as np
from multiprocessing import Pool
from scipy.spatial import cKDTree
from pymatgen.core.structure import Structure,Molecule,Lattice


def paralell_hist_loop(radii_and_indices, kdtree, particles, mins, maxs, N_radii, dr, eps, rho):
    """RDF histogram loop process for multiprocessing"""
    N, d = particles.shape
    g_r_partial = np.zeros(shape=(N_radii))

    for r_idx, r in radii_and_indices:
        r_idx = int(r_idx)
        # find all particles that are at least r + dr away from the edges of the box
        valid_idxs = np.bitwise_and.reduce([(particles[:, i]-(r+dr) >= mins[i]) & (particles[:, i]+(r+dr) <= maxs[i]) for i in range(d)])
        valid_particles = particles[valid_idxs]
        
        # compute n_i(r) for valid particles.
        for particle in valid_particles:
            n = kdtree.query_ball_point(particle, r+dr-eps, return_length=True) - kdtree.query_ball_point(particle, r, return_length=True)
            g_r_partial[r_idx] += n
        
        # normalize
        n_valid = len(valid_particles)
        shell_vol = (4/3)*np.pi*((r+dr)**3 - r**3) if d == 3 else np.pi*((r+dr)**2 - r**2)
        g_r_partial[r_idx] /= n_valid*shell_vol*rho
    
    return g_r_partial
    
def main():
    system = Molecule.from_file(r'C:\Users\bnzmi\OneDrive - Northeastern University\Lustig Lab\RDF Attempts\hybrid7-pos-1.xyz')
    a = 11
    b = 11
    c = 11

    structure = Structure(
        Lattice([[a, 0., 0.], [0., b, 0.], [0., 0., c]]),
        species=system.species,
        coords=system.cart_coords,
        coords_are_cartesian=True   
        )

    structure.make_supercell(1)
    coords = structure.cart_coords
    noise = np.random.normal(loc=0.0, scale=0.05, size=(coords.shape))
    coords = coords + noise

    g_r, radii = rdf(coords, dr=0.05)
    
    return g_r, radii


def rdf(particles, dr, rho=None, rcutoff=0.9, eps=1e-15, parallel=True, progress=False):
    """
    Computes 2D or 3D radial distribution function g(r) of a set of particle 
    coordinates of shape (N, d). Particle must be placed in a 2D or 3D cuboidal 
    box of dimensions [width x height (x depth)].
    
    Parameters
    ----------
    particles : (N, d) np.array
        Set of particle from which to compute the radial distribution function 
        g(r). Must be of shape (N, 2) or (N, 3) for 2D and 3D coordinates 
        repsectively.
    dr : float
        Delta r. Determines the spacing between successive radii over which g(r)
        is computed.
    rho : float, optional
        Number density. If left as None, box dimensions will be inferred from 
        the particles and the number density will be calculated accordingly.
    rcutoff : float
        radii cutoff value between 0 and 1. The default value of 0.9 means the 
        independent variable (radius) over which the RDF is computed will range 
        from 0 to 0.9*r_max. This removes the noise that occurs at r values 
        close to r_max, due to fewer valid particles available to compute the 
        RDF from at these r values.
    eps : float, optional
        Epsilon value used to find particles less than or equal to a distance 
        in KDTree.
    parallel : bool, optional
        Option to enable or disable multiprocessing. Enabling this affords 
        significant increases in speed.
    progress : bool, optional
        Set to False to disable progress readout (only valid when 
        parallel=False).
        
    
    Returns
    -------
    g_r : (n_radii) np.array
        radial distribution function values g(r).
    radii : (n_radii) np.array
        radii over which g(r) is computed
    """

    if not isinstance(particles, np.ndarray):
        particles = np.array(particles)
    # assert particles array is correct shape
    shape_err_msg = 'particles should be an array of shape N x d, where N is \
                     the number of particles and d is the number of dimensions.'
    assert len(particles.shape) == 2, shape_err_msg
    # assert particle coords are 2 or 3 dimensional
    assert particles.shape[-1] in [2, 3], 'RDF can only be computed in 2 or 3 \
                                           dimensions.'
    
    start = time.time()

    mins = np.min(particles, axis=0)
    maxs = np.max(particles, axis=0)
    # translate particles such that the particle with min coords is at origin
    particles = particles - mins

    # dimensions of box
    dims = maxs - mins
    
    r_max = (np.min(dims) / 2)*rcutoff
    radii = np.arange(dr, r_max, dr)

    N, d = particles.shape
    if not rho:
        rho = N / np.prod(dims) # number density
    
    # create a KDTree for fast nearest-neighbor lookup of particles
    tree = cKDTree(particles)

    if parallel:
        N_radii = len(radii)
        radii_and_indices = np.stack([np.arange(N_radii), radii], axis=1)
        radii_splits = np.array_split(radii_and_indices, os.cpu_count(), axis=0)
        values = [(radii_splits[i], tree, particles, mins, maxs, N_radii, dr, eps, rho) for i in range(len(radii_splits))]
        with Pool() as pool:
            results = pool.starmap(paralell_hist_loop, values)
        g_r = np.sum(results, axis=0)
    else:
        g_r = np.zeros(shape=(len(radii)))
        for r_idx, r in enumerate(radii):
            # find all particles that are at least r + dr away from the edges of the box
            valid_idxs = np.bitwise_and.reduce([(particles[:, i]-(r+dr) >= mins[i]) & (particles[:, i]+(r+dr) <= maxs[i]) for i in range(d)])
            valid_particles = particles[valid_idxs]
            
            # compute n_i(r) for valid particles.
            for particle in valid_particles:
                n = tree.query_ball_point(particle, r+dr-eps, return_length=True) - tree.query_ball_point(particle, r, return_length=True)
                g_r[r_idx] += n
            
            # normalize
            n_valid = len(valid_particles)
            shell_vol = (4/3)*np.pi*((r+dr)**3 - r**3) if d == 3 else np.pi*((r+dr)**2 - r**2)
            g_r[r_idx] /= n_valid*shell_vol*rho

            if progress:
                print('Computing RDF     Radius {}/{}    Time elapsed: {:.3f} s'.format(r_idx+1, len(radii), time.time()-start), end='\r', flush=True)

    return g_r, radii
    
# Generate sample particle coordinates

if __name__ == "__main__":
        sys.exit(main())
        
        
num_particles = 1000
coords = np.random.rand(num_particles, 3) * 10  # Coordinates in a 10x10x10 box

# Compute RDF
g_r, radii = rdf(coords, dr=0.05)

# Plot the RDF
plt.plot(radii, g_r)
plt.xlabel('Radius')
plt.ylabel('Radial Distribution Function, g(r)')
plt.title('Radial Distribution Function')
plt.show()

When the script is run, __name__ is "__main__", so main is called.

main calls rdf, with parallel defaulting to True.

rdf uses multiprocessing to call paralell_hist_loop.

The multiprocessing creates a new process that imports the script.

__name__ isn’t "__main__", so main isn’t called.

However, a few lines later, rdf is called, with parallel defaulting to True.

rdf uses multiprocessing to call paralell_hist_loop.

The multiprocessing creates a new process that imports the script.

__name__ isn’t "__main__", so main isn’t called.

However, a few lines later, rdf is called, with parallel defaulting to True.

rdf uses multiprocessing to call paralell_hist_loop.

The multiprocessing creates a new process that imports the script.

And so on…

Hi @MRAB, thank you for the explanation. Can you show me how to use it correctly in that case? From your previous reply, I thought you were suggesting I had to move that “if” statement to outside of the function, and I am not sure how to proceed.

Maybe like this:

import numpy as np
import matplotlib.pyplot as plt

import sys

import os
import time
import numpy as np
from multiprocessing import Pool
from scipy.spatial import cKDTree
from pymatgen.core.structure import Structure,Molecule,Lattice


def paralell_hist_loop(radii_and_indices, kdtree, particles, mins, maxs, N_radii, dr, eps, rho):
    """RDF histogram loop process for multiprocessing"""
    N, d = particles.shape
    g_r_partial = np.zeros(shape=(N_radii))

    for r_idx, r in radii_and_indices:
        r_idx = int(r_idx)
        # find all particles that are at least r + dr away from the edges of the box
        valid_idxs = np.bitwise_and.reduce([(particles[:, i]-(r+dr) >= mins[i]) & (particles[:, i]+(r+dr) <= maxs[i]) for i in range(d)])
        valid_particles = particles[valid_idxs]

        # compute n_i(r) for valid particles.
        for particle in valid_particles:
            n = kdtree.query_ball_point(particle, r+dr-eps, return_length=True) - kdtree.query_ball_point(particle, r, return_length=True)
            g_r_partial[r_idx] += n

        # normalize
        n_valid = len(valid_particles)
        shell_vol = (4/3)*np.pi*((r+dr)**3 - r**3) if d == 3 else np.pi*((r+dr)**2 - r**2)
        g_r_partial[r_idx] /= n_valid*shell_vol*rho

    return g_r_partial


def rdf(particles, dr, rho=None, rcutoff=0.9, eps=1e-15, parallel=True, progress=False):
    """
    Computes 2D or 3D radial distribution function g(r) of a set of particle
    coordinates of shape (N, d). Particle must be placed in a 2D or 3D cuboidal
    box of dimensions [width x height (x depth)].

    Parameters
    ----------
    particles : (N, d) np.array
        Set of particle from which to compute the radial distribution function
        g(r). Must be of shape (N, 2) or (N, 3) for 2D and 3D coordinates
        repsectively.
    dr : float
        Delta r. Determines the spacing between successive radii over which g(r)
        is computed.
    rho : float, optional
        Number density. If left as None, box dimensions will be inferred from
        the particles and the number density will be calculated accordingly.
    rcutoff : float
        radii cutoff value between 0 and 1. The default value of 0.9 means the
        independent variable (radius) over which the RDF is computed will range
        from 0 to 0.9*r_max. This removes the noise that occurs at r values
        close to r_max, due to fewer valid particles available to compute the
        RDF from at these r values.
    eps : float, optional
        Epsilon value used to find particles less than or equal to a distance
        in KDTree.
    parallel : bool, optional
        Option to enable or disable multiprocessing. Enabling this affords
        significant increases in speed.
    progress : bool, optional
        Set to False to disable progress readout (only valid when
        parallel=False).


    Returns
    -------
    g_r : (n_radii) np.array
        radial distribution function values g(r).
    radii : (n_radii) np.array
        radii over which g(r) is computed
    """

    if not isinstance(particles, np.ndarray):
        particles = np.array(particles)
    # assert particles array is correct shape
    shape_err_msg = 'particles should be an array of shape N x d, where N is \
                     the number of particles and d is the number of dimensions.'
    assert len(particles.shape) == 2, shape_err_msg
    # assert particle coords are 2 or 3 dimensional
    assert particles.shape[-1] in [2, 3], 'RDF can only be computed in 2 or 3 \
                                           dimensions.'

    start = time.time()

    mins = np.min(particles, axis=0)
    maxs = np.max(particles, axis=0)
    # translate particles such that the particle with min coords is at origin
    particles = particles - mins

    # dimensions of box
    dims = maxs - mins

    r_max = (np.min(dims) / 2)*rcutoff
    radii = np.arange(dr, r_max, dr)

    N, d = particles.shape
    if not rho:
        rho = N / np.prod(dims) # number density

    # create a KDTree for fast nearest-neighbor lookup of particles
    tree = cKDTree(particles)

    if parallel:
        N_radii = len(radii)
        radii_and_indices = np.stack([np.arange(N_radii), radii], axis=1)
        radii_splits = np.array_split(radii_and_indices, os.cpu_count(), axis=0)
        values = [(radii_splits[i], tree, particles, mins, maxs, N_radii, dr, eps, rho) for i in range(len(radii_splits))]
        with Pool() as pool:
            results = pool.starmap(paralell_hist_loop, values)
        g_r = np.sum(results, axis=0)
    else:
        g_r = np.zeros(shape=(len(radii)))
        for r_idx, r in enumerate(radii):
            # find all particles that are at least r + dr away from the edges of the box
            valid_idxs = np.bitwise_and.reduce([(particles[:, i]-(r+dr) >= mins[i]) & (particles[:, i]+(r+dr) <= maxs[i]) for i in range(d)])
            valid_particles = particles[valid_idxs]

            # compute n_i(r) for valid particles.
            for particle in valid_particles:
                n = tree.query_ball_point(particle, r+dr-eps, return_length=True) - tree.query_ball_point(particle, r, return_length=True)
                g_r[r_idx] += n

            # normalize
            n_valid = len(valid_particles)
            shell_vol = (4/3)*np.pi*((r+dr)**3 - r**3) if d == 3 else np.pi*((r+dr)**2 - r**2)
            g_r[r_idx] /= n_valid*shell_vol*rho

            if progress:
                print('Computing RDF     Radius {}/{}    Time elapsed: {:.3f} s'.format(r_idx+1, len(radii), time.time()-start), end='\r', flush=True)

    return g_r, radii


def main():
    # Generate sample particle coordinates
    num_particles = 1000
    coords = np.random.rand(num_particles, 3) * 10  # Coordinates in a 10x10x10 box

    system = Molecule.from_file(r'C:\Users\bnzmi\OneDrive - Northeastern University\Lustig Lab\RDF Attempts\hybrid7-pos-1.xyz')
    a = 11
    b = 11
    c = 11

    structure = Structure(
        Lattice([[a, 0., 0.], [0., b, 0.], [0., 0., c]]),
        species=system.species,
        coords=system.cart_coords,
        coords_are_cartesian=True
        )

    structure.make_supercell(1)
    coords = structure.cart_coords
    noise = np.random.normal(loc=0.0, scale=0.05, size=(coords.shape))
    coords = coords + noise

    # Compute RDF
    g_r, radii = rdf(coords, dr=0.05)

    # Plot the RDF
    plt.plot(radii, g_r)
    plt.xlabel('Radius')
    plt.ylabel('Radial Distribution Function, g(r)')
    plt.title('Radial Distribution Function')
    plt.show()


if __name__ == "__main__":
    main()

thank you very much! It works now!