# Restrict PyTorch to a single intra-op CPU thread — useful to avoid CPU
# oversubscription (e.g. when several processes/DataLoader workers run at once).
torch.set_num_threads(1)
Show detailed CUDA errors:
https://discuss.pytorch.org/t/whats-the-meaning-of-this-error-how-can-i-debug-when-i-use-gpu/8052/3 — CUDA_LAUNCH_BLOCKING makes CUDA report the error where it actually occurs.
Since the problem is in the CUDA initialization function and does not appear on a different machine, I would guess that your CUDA install is not working properly; you may want to reinstall it and test it with the CUDA samples.
CUDA_LAUNCH_BLOCKING=1
How to load part of a pre-trained model:
# Load a checkpoint into `model`, keeping only the weights that the model
# actually has. NOTE: filtering by key name alone is not enough — a tensor
# whose shape differs from the model's would still make load_state_dict
# raise, so shape-mismatched entries are skipped as well.
partial = torch.load("path/to/model.pth", map_location=lambda storage, loc: storage)
state = model.state_dict()
# 1. filter out unnecessary keys (missing from the model or wrong shape)
pretrained_dict = {
    k: v for k, v in partial.items()
    if k in state and v.size() == state[k].size()
}
# 2. overwrite entries in the existing state dict
state.update(pretrained_dict)
# 3. load the new state dict
model.load_state_dict(state)
https://github.com/ildoonet/pytorch-gradual-warmup-lr
pip install git+https://github.com/ildoonet/pytorch-gradual-warmup-lr.git
# Gradual LR warmup followed by cosine annealing
# (pip install git+https://github.com/ildoonet/pytorch-gradual-warmup-lr.git).
# NOTE: as pasted the loop body was not indented — fixed here.
from warmup_scheduler import GradualWarmupScheduler

scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, max_epoch)
# multiplier=8: LR is ramped up to 8x the base LR over the first
# total_epoch=10 epochs, then after_scheduler (cosine) takes over.
scheduler_warmup = GradualWarmupScheduler(optimizer, multiplier=8, total_epoch=10, after_scheduler=scheduler_cosine)
for epoch in range(train_epoch):
    scheduler_warmup.step()  # 10 epoch warmup, after that schedule as after_scheduler
    ...
--samples_per_plugin scalars=0
--samples_per_plugin images=1000
# default is 1000
# tensorboard/backend/application.py
# scalar_metadata.PLUGIN_NAME: 1000
https://stackoverflow.com/questions/41074688/how-do-you-read-tensorboard-files-programmatically
# Read data back out of a TensorBoard event file programmatically.
# size_guidance caps how many events of each type are retained in memory;
# per the TensorBoard docs, 0 means "keep everything".
from tensorboard.backend.event_processing import event_accumulator

size_guidance = {
    event_accumulator.COMPRESSED_HISTOGRAMS: 500,
    event_accumulator.IMAGES: 4,
    event_accumulator.AUDIO: 4,
    event_accumulator.SCALARS: 0,
    event_accumulator.HISTOGRAMS: 1,
}
ea = event_accumulator.EventAccumulator(
    'events.out.tfevents.x.ip-x-x-x-x',
    size_guidance=size_guidance,
)
ea.Reload()  # loads events from file
>> <tensorflow.python.summary.event_accumulator.EventAccumulator at 0x7fdbe5ff59e8>
ea.Tags()
>>
{'audio': [],
'compressedHistograms': [],
'graph': True,
'histograms': [],
'images': [],
'run_metadata': [],
'scalars': ['Loss', 'Epsilon', 'Learning_rate']}
ea.Scalars('Loss')
>>
[ScalarEvent(wall_time=1481232633.080754, step=1, value=1.6365480422973633),
ScalarEvent(wall_time=1481232633.2001867, step=2, value=1.2162202596664429),
ScalarEvent(wall_time=1481232633.3877788, step=3, value=1.4660096168518066),
ScalarEvent(wall_time=1481232633.5749283, step=4, value=1.2405034303665161),
ScalarEvent(wall_time=1481232633.7419815, step=5, value=0.897326648235321),
...]
watch -n 1 nvidia-smi
# Bytes of GPU memory currently occupied by tensors on the current CUDA device.
torch.cuda.memory_allocated()