A way to get all of the modules

My proposal to add name suggestions to ModuleNotFoundError was rejected on the grounds that there is no way to get a list of all modules, so I wrote the code below to do that. However, I can only test it on Windows. How will it behave on other systems? And how should the builtin_modules table be initialized?

The two related issues: 134872, 134175

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#ifdef _WIN32
    #include <windows.h>
    #include <direct.h>
#else
    #include <dirent.h>
    #include <unistd.h>
#endif
#include <sys/stat.h>

// Build examples. Each -D option defines one of the optional build-time macros
// read by this file; PYTHON_SITE_PATH can be supplied the same way.
// gcc list_modules.c -DPYTHON_STD_PATH=\"/usr/lib/python3.11\" -o list_modules
// gcc list_modules.c -DPY_EXT_PREFIX=\"cpython-311-x86_64-linux-gnu\" -o list_modules

// File-name suffix of Python source modules.
#define PY_EXT_PY ".py"

// File-name suffix of compiled extension modules on the target platform.
#ifdef _WIN32
    #define PY_EXT_PYD ".pyd"
#else
    #define PY_EXT_SO ".so"
#endif

// ===== built-in modules =====
// NULL-terminated table of module names that find_all_packages() adds to the
// result without looking at the file system.
// NOTE(review): the table contains Windows-only names ("_winapi", "msvcrt",
// "nt", "winreg"), so it looks like a snapshot taken from a Windows build --
// confirm before relying on it on other platforms.
const char* builtin_modules[] = {
    "_abc", "_ast", "_bisect", "_blake2", "_codecs", "_codecs_cn",
    "_codecs_hk", "_codecs_iso2022", "_codecs_jp", "_codecs_kr",
    "_codecs_tw", "_collections", "_contextvars", "_csv", "_datetime",
    "_functools", "_heapq", "_imp", "_interpchannels", "_interpqueues",
    "_interpreters", "_io", "_json", "_locale", "_lsprof", "_md5", 
    "_multibytecodec", "_opcode",  "_operator", "_pickle", "_random",
    "_sha1", "_sha2", "_sha3", "_signal", "_sre", "_stat", "_statistics",
    "_string", "_struct", "_suggestions", "_symtable", "_sysconfig", "_thread",
    "_tokenize", "_tracemalloc", "_typing", "_warnings", "_weakref", "_winapi",
    "array", "atexit", "binascii", "builtins", "cmath", "errno", "faulthandler",
    "gc", "itertools", "marshal", "math", "mmap", "msvcrt", "nt", "sys", "time",
    "winreg", "xxsubtype", "zlib",
    NULL  // sentinel: iteration stops here
};

// Report whether `name` ends with `suffix` and is strictly longer than it.
// Returns 1 on a match and 0 otherwise; a name that is exactly the suffix
// (no stem in front of it) does not match.
int has_suffix(const char *name, const char *suffix) {
    const size_t name_len = strlen(name);
    const size_t suffix_len = strlen(suffix);

    // Guard first so the tail offset below can never underflow.
    if (name_len <= suffix_len) {
        return 0;
    }

    const char *tail = name + (name_len - suffix_len);
    return strcmp(tail, suffix) == 0;
}

#ifdef PY_EXT_PREFIX
// Return 1 when the build-time string PY_EXT_PREFIX occurs anywhere inside
// `filename`, and 0 otherwise. Only compiled when PY_EXT_PREFIX is defined.
int match_ext_prefix(const char* filename) {
    const char *hit = strstr(filename, PY_EXT_PREFIX);
    if (hit == NULL) {
        return 0;
    }
    return 1;
}
#endif

// Growable list of module names, managed by create_module_list(),
// add_module_to_list() and free_module_list().
typedef struct {
    char **names;     // heap array of heap-allocated, NUL-terminated names
    size_t count;     // number of slots of `names` currently in use
    size_t capacity;  // number of slots `names` has room for
} ModuleList;

// Allocate an empty module list with room for 32 names.
// Returns NULL when either allocation fails; the caller owns the result and
// releases it with free_module_list().
ModuleList* create_module_list() {
    ModuleList *result = malloc(sizeof(ModuleList));
    if (result == NULL) {
        return NULL;
    }

    result->count = 0;
    result->capacity = 32;
    result->names = malloc(result->capacity * sizeof(char*));
    if (result->names != NULL) {
        return result;
    }

    // The name array could not be allocated: undo the first allocation.
    free(result);
    return NULL;
}

// Release every stored name, the name array, and the list itself.
// Passing NULL is allowed and does nothing.
void free_module_list(ModuleList *list) {
    if (list == NULL) {
        return;
    }

    size_t idx = 0;
    while (idx < list->count) {
        free(list->names[idx]);
        idx++;
    }

    free(list->names);
    free(list);
}

// Append a private copy of `name` to `list` unless an equal name is already
// stored.
//
// Returns 1 when the name is in the list afterwards (newly added or already
// present) and 0 on failure: NULL argument, allocation failure, or a capacity
// that cannot be grown. On failure the stored names are left unchanged.
int add_module_to_list(ModuleList *list, const char *name) {
    if (!list || !name) return 0;

    // Linear duplicate check: names must be unique in the list.
    for (size_t i = 0; i < list->count; i++) {
        if (strcmp(list->names[i], name) == 0) {
            return 1; // Already exists
        }
    }

    // Grow the array when it is full. A capacity of 0 (e.g. a zero-initialised
    // list) must grow to a non-zero size; doubling 0 would leave no room and
    // the store below would write out of bounds.
    if (list->count >= list->capacity) {
        size_t new_capacity = list->capacity ? list->capacity * 2 : 32;
        if (new_capacity <= list->capacity ||
            new_capacity > (size_t)-1 / sizeof *list->names) {
            return 0; // size computation would overflow
        }

        char **new_names = realloc(list->names, new_capacity * sizeof *list->names);
        if (!new_names) return 0; // list->names is still valid

        list->names = new_names;
        list->capacity = new_capacity;
    }

    // Copy with malloc/memcpy rather than strdup(): strdup is POSIX/C23 and is
    // not declared by <string.h> under strict ISO C11.
    size_t name_size = strlen(name) + 1;
    char *name_copy = malloc(name_size);
    if (!name_copy) return 0;
    memcpy(name_copy, name, name_size);

    list->names[list->count++] = name_copy;
    return 1;
}

// Write every stored name to stdout, one per line, in insertion order.
// A NULL list prints nothing.
void print_module_list(const ModuleList *list) {
    if (list == NULL) {
        return;
    }

    size_t idx = 0;
    while (idx < list->count) {
        printf("%s\n", list->names[idx]);
        idx++;
    }
}

// Add `filename` to `list` through add_module_to_list() and report a failure
// on stderr instead of returning it to the caller.
void add_module_name(const char* filename, ModuleList *list) {
    int added = add_module_to_list(list, filename);
    if (added) {
        return;
    }
    fprintf(stderr, "Failed to add module: %s\n", filename);
}

// Record the first `stem_len` bytes of `filename` as a module name.
// The stem is copied into a heap buffer sized to fit, so long file names
// cannot overflow a fixed-size array. An empty stem is ignored.
static void scan_dir_add_stem(const char *filename, size_t stem_len, ModuleList *list) {
    if (stem_len == 0) return;

    char *stem = malloc(stem_len + 1);
    if (!stem) {
        fprintf(stderr, "Failed to add module: %s\n", filename);
        return;
    }
    memcpy(stem, filename, stem_len);
    stem[stem_len] = '\0';
    add_module_name(stem, list);  // the list stores its own copy
    free(stem);
}

// Derive a module name from the name of a regular file and add it to `list`.
//   - "*.py": the name without the ".py" suffix.
//   - extension module (".pyd" on Windows, ".so" elsewhere) whose name
//     contains PY_EXT_PREFIX: everything before the FIRST dot, so that
//     "foo.<tag>.so" / "foo.<tag>.pyd" both yield "foo". Only compiled when
//     PY_EXT_PREFIX is defined.
// Any other file is ignored.
static void scan_dir_add_file(const char *filename, ModuleList *list) {
    if (has_suffix(filename, PY_EXT_PY)) {
        scan_dir_add_stem(filename, strlen(filename) - strlen(PY_EXT_PY), list);
        return;
    }
#ifdef PY_EXT_PREFIX
    #ifdef _WIN32
    const char *ext_suffix = PY_EXT_PYD;
    #else
    const char *ext_suffix = PY_EXT_SO;
    #endif
    if (has_suffix(filename, ext_suffix) && match_ext_prefix(filename)) {
        const char *dot = strchr(filename, '.');
        if (dot) {
            scan_dir_add_stem(filename, (size_t)(dot - filename), list);
        }
    }
#endif
}

// Scan the directory `path` (non-recursively) and add to `list` the name of
// every importable entry found directly inside it:
//   - regular files, as classified by scan_dir_add_file();
//   - sub-directories that contain an "__init__.py" entry (packages).
// Entries whose name starts with '.' are skipped on every platform (this also
// covers "." and ".."). Entries whose full path does not fit the path buffer
// are skipped rather than looked up under a truncated name. A NULL `path` or a
// directory that cannot be opened is silently ignored.
void scan_dir(const char* path, ModuleList *list) {
    if (!path) return;

#ifdef _WIN32
    // Use the explicit ANSI ("A") API so the char buffers match the API types
    // even when the program is built with UNICODE defined.
    WIN32_FIND_DATAA find_data;
    char pattern[MAX_PATH];
    int n = snprintf(pattern, sizeof(pattern), "%s\\*", path);
    if (n < 0 || (size_t)n >= sizeof(pattern)) return;

    HANDLE handle = FindFirstFileA(pattern, &find_data);
    if (handle == INVALID_HANDLE_VALUE) return;

    do {
        const char *entry_name = find_data.cFileName;
        if (entry_name[0] == '.') continue;

        if (find_data.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) {
            char initfile[MAX_PATH];
            n = snprintf(initfile, sizeof(initfile), "%s\\%s\\__init__.py", path, entry_name);
            if (n < 0 || (size_t)n >= sizeof(initfile)) continue;

            WIN32_FILE_ATTRIBUTE_DATA file_data;
            if (GetFileAttributesExA(initfile, GetFileExInfoStandard, &file_data)) {
                add_module_name(entry_name, list);
            }
        } else {
            scan_dir_add_file(entry_name, list);
        }
    } while (FindNextFileA(handle, &find_data));

    FindClose(handle);
#else
    DIR *dir = opendir(path);
    if (!dir) return;

    struct dirent *entry;
    while ((entry = readdir(dir)) != NULL) {
        const char *entry_name = entry->d_name;
        if (entry_name[0] == '.') continue;

        char fullpath[1024];
        int n = snprintf(fullpath, sizeof(fullpath), "%s/%s", path, entry_name);
        if (n < 0 || (size_t)n >= sizeof(fullpath)) continue;

        // stat() follows symbolic links, so links to files/directories count.
        struct stat st;
        if (stat(fullpath, &st) == -1) continue;

        if (S_ISREG(st.st_mode)) {
            scan_dir_add_file(entry_name, list);
        } else if (S_ISDIR(st.st_mode)) {
            char initfile[1024];
            n = snprintf(initfile, sizeof(initfile), "%s/__init__.py", fullpath);
            if (n < 0 || (size_t)n >= sizeof(initfile)) continue;

            if (stat(initfile, &st) == 0) {
                add_module_name(entry_name, list);
            }
        }
    }
    closedir(dir);
#endif
}

// Build the list of module names this program can discover.
//
// Sources, in order (duplicates are dropped by add_module_to_list()):
//   1. the current working directory,
//   2. the builtin_modules table,
//   3. PYTHON_STD_PATH, when defined at build time,
//   4. PYTHON_SITE_PATH, when defined at build time.
//
// Returns a list the caller releases with free_module_list(), or NULL when
// the list itself cannot be allocated. Failures to add individual names are
// reported on stderr and do not abort the scan.
ModuleList* find_all_packages() {
    ModuleList *list = create_module_list();
    if (!list) return NULL;

    // 1. First scan current directory
    char current_dir[1024];
#ifdef _WIN32
    // <direct.h> declares _getcwd(); its size parameter is an int.
    char *cwd = _getcwd(current_dir, (int)sizeof(current_dir));
#else
    // <unistd.h> declares getcwd().
    char *cwd = getcwd(current_dir, sizeof(current_dir));
#endif
    if (cwd) {
        scan_dir(current_dir, list);
    }

    // 2. Add builtin modules
    for (size_t i = 0; builtin_modules[i] != NULL; i++) {
        if (!add_module_to_list(list, builtin_modules[i])) {
            fprintf(stderr, "Failed to add builtin module: %s\n", builtin_modules[i]);
        }
    }

    // 3. Scan standard library path
#ifdef PYTHON_STD_PATH
    scan_dir(PYTHON_STD_PATH, list);
#endif

    // 4. Scan site-packages for third-party modules
#ifdef PYTHON_SITE_PATH
    scan_dir(PYTHON_SITE_PATH, list);
#endif

    return list;
}

// Entry point: collect the module names, print a count header followed by one
// name per line, and exit with 0. Exits with 1 if the list cannot be created.
int main() {
    ModuleList *found = find_all_packages();

    if (found != NULL) {
        printf("Found %zu Python modules/packages:\n", found->count);
        print_module_list(found);
        free_module_list(found);
        return 0;
    }

    fprintf(stderr, "Failed to create module list\n");
    return 1;
}
1 Like

You can get built in names from here.

1 Like

To be clear: I didn’t reject the feature because it’s impossible to get all modules. I rejected it because:

  • we need to build a cache for the modules that exist, including those in site-packages, not just the built-in ones. Giving suggestions for only the built-in ones would work, but this is a half-feature to me and I prefer not having a half-feature, especially if its implementation cannot be easily extended (sys.builtin_module_names and sys.stdlib_module_names are built with Python, so for those we already have the cache).
  • this cache needs to be maintained in importlib or at the interpreter’s level in C and would be used every time we import a module. In particular, there may be some operations that can’t be done (like importing the discovered packages while doing the bootstrapping)

In addition, and this is my fault here, I would have wanted a d.p.o thread first for such feature request, as well as (and this is something I asked on the issue), a proof-of-concept showing that imports performance and memory usage are not affected by the cache construction or the modules discovery.

2 Likes

Well, that causes a circular dependency, because it needs the sys module. But I think there may be a way to determine all of the built-in modules at compile time. Anyway, I will try to build the code with CPython, implement the feature, and analyse the speed.

I had a try here. The change does not affect the speed, in either the normal case or the error case. The only problem is that the suggestion is sometimes unstable. For example, for import ant, the suggestion may be ast, nt or aqt on my computer, so I am trying to fix that.

I agree with this rationale. I would add that suggesting an alternative module name might be more unhelpful in some cases. For example import ant might actually want to import the ant project from PyPI. Suggesting an alternative module name would be the wrong behaviour, as the problem is that the package is not installed in site-packages / the active virtual environment.

A

Well, I think that objection is too harsh on the suggestion feature. Any automatic suggestion will inevitably be wrong sometimes. For example, if I define a function with the misspelled name randon_xxx and then call random_xxx, Python suggests randon_xxx, which can confuse the other team members. A suggestion only indicates one possibility; which fix is right depends on the actual situation.

And suggesting “pip install xxx” is dangerous. For example, suppose there is a module normal_module, and an attacker publishes a module named normol_module. A user who has installed normal_module but misspells it as normol_module would be told to “pip install normol_module”, and the user’s private information would go to the attacker.

See here. Now the suggestion of the name is stable. Here is the test of “test.py” in the storage(what add is: ("import abd", ModuleNotFoundError, "no module named 'abd'. Did you mean 'abc'?")):

No. number of entries used time(/s) average time(/s) result
1 5 0.093 0.019 success
2 5 0.087 0.017 success
3 5 0.079 0.016 success
4 5 0.078 0.016 success
5 5 0.089 0.018 success
6 6 0.086 0.014 success
7 6 0.080 0.013 success
8 6 0.086 0.014 success
9 6 0.089 0.015 success
10 6 0.076 0.013 success

Looking at the columns, we find that the number of entries has almost no effect. So most of the time is spent by IDLE and by the start-up of unittest, and the time needed to find the most likely module is close to zero.

Testing the speed of the find_all_packages.scan_dir("path/to/site-packages"), I got the data below:

Tools number average time(/ms)
timeit 1000 5.795

There are 236 entries in site-packages on my computer. So I can assert that, long before the number of packages noticeably slows down the site-packages scan, the device would already refuse to install anything more because its storage is full. The cache is unnecessary.

More about why cache is useless

The argument for “building a cache” assumes that the ModuleNotFoundError suggestion affects normal imports. However, the function that computes the suggestion for the traceback does not run until the error has propagated, so normal imports are not affected. Consider the usual code structure:

#1. import module
import module1
import module2
from module3 import name1
from module4.child_module import name2

#2. define the variable
a = 1
def b():
    ...
class C:
    def __init__(self):
        ...
    ...

def main():
    ...

#3. running the script
if __name__ == "__main__":
    main()

ModuleNotFoundError usually occurs at the import stage, so the question “how much does computing the suggestion cost?” does not matter: the main module has already stopped. The only exception is delayed imports, but as I said, if you catch the exception, the suggestion code does not run. If you do not catch it, the cost is still small and the script has to stop anyway.

The other

If the user simply forgot to install the package, is the suggestion unhelpful?

Well, this is a question for every automatic suggestion: some suggestions will inevitably be wrong. For example, if I misspell a name in its definition but spell it correctly when I use it, NameError cannot suggest changing the name in the definition; it can only suggest using the misspelled name.
So, is it possible to suggest that the user run “pip install xxx”? No! That is DANGEROUS. It would help attackers harm users, because misspelled names are often exploited in homograph (typosquatting) attacks.

Thanks

@aroberge

1 Like

I think you suggested this before. Did things change meanwhile?

For example, the import of module.child_module didn’t change in 3.14. However, I changed the format to “module ‘module’ has no child module ‘child_module’” to better to show the level.

And now I had made all of what suggested in that suggestion. There is almost no change.

If you want to improve the suggestions in the REPL, you can open pull requests against Python and we’ll review them.

If I said that the change now is not only REPL, what’s your reaction?
However, so far I have only tested it on Windows, and it is packaged with “nanobind”. So what should I do to integrate it with CPython? If somebody has suggestions, I can’t thank you enough.

There are now many ideas from different developers, and some of them conflict. I would like to settle on exactly one final design.
Now the test in 3.15 on my computer here (seven entries):

The developer @picnixz said that there is a problem with the cache. However, I found that there is not. Here is the implementation of it, and the data is here. Which data should I choose now?

Now that the change can pass the python test and my test.

See here. Now I make my implementation for it as a third-party-package, and I need some feedback:

First, create a virtual environment for test.
Second, run path/to/virtual_environment/python -m pip install friendly_module_not_found_error.
Third, run the python.exe in virtual environment and give me the feedback.

I want to see how the users think and decide that whether to insist it as a python’s feature in the future.

(post deleted by author)

1 Like

I didn’t even click the link. I like the stack overflow policy of posts should be reasonably self explanatory without needing to follow links (that break)

1 Like

Thanks. I will give the text here and then change “README.md” there.
The example:

import xxx.yyy.zzz

If “xxx” not exist, the message is:
“No module named ‘xxx’”
If “xxx” exist but “yyy” not exist, the message is:
“module ‘xxx’ has no child module ‘yyy’”
Then the message add like the text below:
The final name will be compared with all modules at that path. If it is at the top level, it is first compared with the stdlib and then with the paths in sys.path. Or, if the preceding module is not a package and the current module does not exist, the message will add “module ‘xxx’ is not a package”. For a non-package module, this case is not supported: the module has a child module, which in turn has a child module. For a package, it will scan the attribute __path__ to get all possible child modules to compare.

The change can clearly show the specific error in import and give the near name suggestion. For example, the original is “No module named ‘xxx.yyy.zzz’”, we cannot get message that which step is wrong, now we can see which step is wrong:
“No module named ‘xxx’” means the top level, “module ‘xxx’ has no child module ‘yyy’” means the second level, “module ‘xxx.yyy’ has no child module ‘zzz’” means the third level, and so on. And like NameError and AttributeError, it will suggest a possible name.

How does your proposal handle import hooks, where what is available for import can be calculated dynamically?