I wanted to use pickle to cache the data I loaded, but I found that when I used pickle.load to load the variables of the dump, their sha1 values changed.
You can use the following procedure to reproduce the problem. First you need to run GenerateDate.py to generate some data. Then, you need to run load.py to reproduce the problem.
In load.py, the sha1 value of the originally built variable (ori) is printed, as well as the sha1 values of the variables loaded by pickle.load in the two processes (load_data, load_data_more, load_data_2). You will find that these values differ somewhat.
For example, running on my computer the result is:
rank 0, ori hash is 2a0e60672abab8ba4816d2cfeb228797fa5ae2de
rank 1, load hash is 797e95e3048c36cca070c8f9778c908ece7fc59f
rank 0, load hash is 7d48bfc9a7e4282845a5eff0cfd00e29c10818b7
rank 0, more load hash is 7d48bfc9a7e4282845a5eff0cfd00e29c10818b7
rank 1, more load hash is 7d48bfc9a7e4282845a5eff0cfd00e29c10818b7
rank 1, 2 load hash is 7d48bfc9a7e4282845a5eff0cfd00e29c10818b7
rank 0, 2 load hash is 7d48bfc9a7e4282845a5eff0cfd00e29c10818b7
When I use the sha1sum program to calculate the sha1 of the dumped file (cache.pkl), it is the same as the sha1 of ori.
I want to know why!
GenerateDate.py
import pickle
import random
from pathlib import Path
from typing import Sequence
import numpy as np
def pickle_once(frame_names: Sequence, frame_data: np.ndarray, target_path: Path):
    """Serialize one sequence's frame names and frame data under *target_path*.

    Creates the directory (and parents) if needed, then writes two pickles:
    ``frame_names.pkl`` and ``frame_data.pkl``.
    """
    target_path.mkdir(parents=True, exist_ok=True)
    for filename, payload in (('frame_names.pkl', frame_names),
                              ('frame_data.pkl', frame_data)):
        with open(target_path / filename, 'wb') as sink:
            pickle.dump(payload, sink)
# Root directories for the two synthetic modalities.
pose_root = Path(r'./pose_data')
silh_root = Path(r'./silh_data')

# The original file duplicated this loop once per modality; the only
# differences were the root directory and the per-frame array shape, so
# the two copies are folded into one parameterized loop.  Generation
# order (all pose sequences first, then all silhouette sequences) is
# preserved.
for root, frame_shape in ((pose_root, (17, 3)), (silh_root, (64, 44))):
    # 100 subjects x 3 sequences each, with a random frame count per sequence.
    for subject in range(100):
        for seq in range(3):
            data_path = root / f"{subject:04}" / f"seq-{seq}"
            frame_num = random.randint(10, 100)
            data = np.random.randn(frame_num, *frame_shape)
            # Zero-padded frame names, one per frame.
            frame_names = [f'{idx:05}' for idx in range(frame_num)]
            pickle_once(frame_names, data, data_path)
load.py
import multiprocessing as mp
import pickle
import re
from multiprocessing import Event
from pathlib import Path
from typing import Dict, List, Any
from hashlib import sha1
import numpy as np
class Sample:
    """Minimal record pairing a sample id with its property mapping."""

    def __init__(self, sample_id, properties) -> None:
        # Stored verbatim; subclasses and callers read these attributes directly.
        self.id, self.properties = sample_id, properties

    def __repr__(self):
        return "ID {}, properties {}".format(self.id, self.properties)
class SequenceSample(Sample):
    """A Sample backed by two on-disk pickles: frame names and a data array.

    With ``cache=True`` both pickles are read eagerly at construction time;
    otherwise each is loaded lazily on first access (and only the names are
    memoized — ``data`` re-reads the file on every access when not cached).
    """

    def __init__(self, sample_id, properties, path, cache=False):
        super().__init__(sample_id, properties)
        self.path = path
        self._names = None
        self._data = None
        if cache:
            # Populate both caches up front.
            self._names = self.frame_names
            self._data = self.__load__()

    @property
    def data(self):
        """Frame data array: cached copy if present, else re-read from disk."""
        return self._data if self._data is not None else self.__load__()

    @property
    def frame_names(self):
        """Frame-name list, unpickled from frame_names.pkl on first access."""
        if self._names is None:
            with open(self.path / 'frame_names.pkl', 'rb') as source:
                self._names = pickle.load(source)
        return self._names

    def __load__(self):
        """Unpickle and return the raw frame-data array."""
        with open(self.path / 'frame_data.pkl', 'rb') as source:
            return pickle.load(source)
class PairSample(Sample):
    """Couples a pose sequence and a silhouette sequence for one sample.

    Exposes only the frames present in BOTH underlying sequences; with
    ``cache=True`` the common names and the aligned data are computed
    eagerly at construction.
    """

    def __init__(self, sample_id, properties, pose_path, silh_path, cache: bool = False):
        super().__init__(sample_id, properties)
        self.pose_sample = SequenceSample(sample_id, properties, pose_path, cache)
        self.silh_sample = SequenceSample(sample_id, properties, silh_path, cache)
        self._common_frame_name = None
        self._common_frame_data = None
        if cache:
            self._common_frame_name = self.frame_names
            self._common_frame_data = self.data

    @property
    def data(self):
        """Tuple of (pose_data, silh_data) restricted to the common frames."""
        return self._common_frame_data if self._common_frame_data is not None else self.__load__()

    @property
    def frame_names(self):
        """Sorted list of names common to both sequences (computed once)."""
        if self._common_frame_name is None:
            shared = set(self.pose_sample.frame_names) & set(self.silh_sample.frame_names)
            self._common_frame_name = sorted(shared)
        return self._common_frame_name

    def __load__(self):
        """Index each modality's array by the positions of the common names."""
        names = self.frame_names
        pose_idx = np.asarray([self.pose_sample.frame_names.index(name) for name in names])
        silh_idx = np.asarray([self.silh_sample.frame_names.index(name) for name in names])
        return self.pose_sample.data[pose_idx], self.silh_sample.data[silh_idx]
class Person:
    """Container aggregating all samples belonging to one person id."""

    def __init__(self, person_id):
        self.id = person_id
        self.samples = []

    def append(self, element):
        """Add one sample to this person's collection."""
        self.samples.append(element)

    def __getitem__(self, item):
        return self.samples[item]

    def __len__(self):
        return len(self.samples)

    def __iter__(self):
        return iter(self.samples)

    def __repr__(self):
        return "ID {}, samples number {}".format(self.id, len(self.samples))
def load_without_cache(pose_root, silh_root) -> List[Person]:
    """Scan the pose tree, pair each sequence with its silhouette twin, and
    group the resulting PairSamples by person id.

    Sequences with no silhouette counterpart, or with no frames common to
    both modalities, are skipped.  Returns the people sorted by id.
    """
    pose_root = Path(pose_root)
    silh_root = Path(silh_root)
    people_by_id: Dict[str, Person] = {}
    # Directory layout is <root>/<pid>/<key-value[_key-value...]>.
    for pose_seq_path in sorted(pose_root.glob("*/*")):
        rel = pose_seq_path.relative_to(pose_root)
        silh_seq_path = silh_root / rel
        if not silh_seq_path.exists():
            continue  # no silhouette counterpart for this sequence
        pid, prop_str = rel.parts
        # Parse "key-value" tokens out of the sequence directory name.
        seq_properties = {}
        for token in prop_str.split('_'):
            key, value = re.match(r"(\w+)-(\S+)", token).groups()
            seq_properties[key] = value
        pair = PairSample(pid, seq_properties, pose_seq_path, silh_seq_path)
        if not pair.frame_names:
            continue  # the two modalities share no frames
        people_by_id.setdefault(pid, Person(pid)).append(pair)
    return sorted(people_by_id.values(), key=lambda person: person.id)
def get_hash(data: Any):
    """Return the sha1 hex digest of *data*'s pickle serialization."""
    return sha1(pickle.dumps(data)).hexdigest()
def cache_to_file(cache_path, data):
    """Pickle *data* to *cache_path*, flushing before the file is closed."""
    with open(cache_path, "wb") as sink:
        pickle.dump(data, sink)
        sink.flush()
def load_from_cache(cache_path: Path):
    """Unpickle and return whatever object *cache_path* holds."""
    with open(cache_path, "rb") as source:
        return pickle.load(source)
def proc_func(rank: int, event: Event):
    """Worker body for the repro: rank 0 builds the dataset, prints its
    sha1, and writes the pickle cache; every rank then waits on *event*
    and loads the cache three times, printing the sha1 of each copy.

    Returns the first loaded dataset.
    """
    cache_path = Path("./cache") / "cache.pkl"
    cache_path.parent.mkdir(parents=True, exist_ok=True)
    if rank == 0:
        # NOTE(review): both roots point at './silh_data' here — confirm the
        # pose root was not meant to be './pose_data'.
        dataset = load_without_cache(r'./silh_data', r'./silh_data')
        print(f"rank {rank}, ori hash is {get_hash(data=dataset)}")
        cache_to_file(cache_path, dataset)
        event.set()  # unblock the waiting ranks
    event.wait()
    load_data = load_from_cache(cache_path)
    print(f"rank {rank}, load hash is {get_hash(data=load_data)}")
    load_data_more = load_from_cache(cache_path)
    print(f"rank {rank}, more load hash is {get_hash(data=load_data_more)}")
    load_data_2 = load_from_cache(cache_path)
    print(f"rank {rank}, 2 load hash is {get_hash(data=load_data_2)}")
    return load_data
if __name__ == '__main__':
    # 'spawn' gives each worker a fresh interpreter (no inherited state).
    mp.set_start_method('spawn')
    barrier = Event()
    print(barrier.is_set())
    # Two ranks share one Event: rank 0 signals once the cache is written.
    workers = [mp.Process(target=proc_func, args=(rank, barrier)) for rank in range(2)]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
        worker.close()