Skip to content

Fix imports and refactor the test_gpu script #19

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions main.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import argparse, os, sys, datetime, glob, importlib, csv
import argparse, os, sys, datetime, glob
import numpy as np
import time
import torch
Expand All @@ -7,13 +7,13 @@

from packaging import version
from omegaconf import OmegaConf
from torch.utils.data import random_split, DataLoader, Dataset, Subset
from torch.utils.data import DataLoader, Dataset
from functools import partial
from PIL import Image

from pytorch_lightning import seed_everything
from pytorch_lightning.trainer import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint, Callback, LearningRateMonitor
from pytorch_lightning.callbacks import Callback
from pytorch_lightning.utilities.distributed import rank_zero_only
from pytorch_lightning.utilities import rank_zero_info

Expand Down
1 change: 0 additions & 1 deletion scripts/checker.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import os
import glob
import subprocess
import time
import fire

Expand Down
2 changes: 1 addition & 1 deletion scripts/img2img.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""make variations of input image"""

import argparse, os, sys, glob
import argparse, os
import PIL
import torch
import numpy as np
Expand Down
2 changes: 1 addition & 1 deletion scripts/inpaint.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import argparse, os, sys, glob
import argparse, os, glob
from omegaconf import OmegaConf
from PIL import Image
from tqdm import tqdm
Expand Down
2 changes: 1 addition & 1 deletion scripts/inpaint_sd.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import argparse, os, sys, glob
import argparse, os, glob
from omegaconf import OmegaConf
from PIL import Image
from tqdm import tqdm
Expand Down
6 changes: 0 additions & 6 deletions scripts/logging_template.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,8 @@
import numpy as np
from omegaconf import OmegaConf
import streamlit as st
from streamlit import caching
from PIL import Image
from torch.utils.data import DataLoader
from torch.utils.data.dataloader import default_collate
import pytorch_lightning as pl
from pytorch_lightning import seed_everything
from pytorch_lightning.callbacks import Callback
from pytorch_lightning.utilities.distributed import rank_zero_only
from tqdm import tqdm
import datetime

Expand Down
1 change: 0 additions & 1 deletion scripts/mnist-distributed.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
import torch
import torch.nn as nn
import torch.distributed as dist
from apex.parallel import DistributedDataParallel as DDP
from apex import amp


Expand Down
20 changes: 16 additions & 4 deletions scripts/test_gpu.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import socket
try:


def main() -> int:
import torch
n_gpus = torch.cuda.device_count()
print(f"checking {n_gpus} gpus.")
Expand All @@ -16,7 +18,10 @@
out = net(data)
out.backward(torch.randn_like(out))
torch.cuda.synchronize()
except RuntimeError as err:
return 1


def runtime_error_case() -> None:
import requests
import datetime
import os
Expand All @@ -26,5 +31,12 @@
resp = requests.get('http://169.254.169.254/latest/meta-data/instance-id')
print(f'ERROR at {ts} on {hostname}/{resp.text} (CUDA_VISIBLE_DEVICES={device}): {type(err).__name__}: {err}', flush=True)
raise err
else:
print(f"checked {socket.gethostname()}")


if __name__ == '__main__':
    # Script entry point: run the GPU check, report a RuntimeError through
    # runtime_error_case(), and otherwise print which host was checked.
    try:
        main()
    except RuntimeError as err:
        # NOTE(review): runtime_error_case() takes no arguments yet uses
        # `err`, so it presumably reads this module-level binding, which
        # only exists while this handler is running. Keep the name `err`
        # and keep the call inside the handler -- confirm against the
        # full function body.
        runtime_error_case()
    else:
        # Success path: only reached when main() raised nothing.
        checked_host = socket.gethostname()
        print(f"checked {checked_host}")