# Source: cv_final/play.py (GitHub repository yuetu00/cv_final, branch main).

import numpy as np
import cv2
import time
import torch.nn.functional as F
from mss import mss
from pynput.keyboard import Controller, Key
import torch
import torch.nn as nn
from model import DJMaxModel


# (height, width, channels) of a frame; matches the 224x128 RGB array that
# the capture step in Player.play produces.
# NOTE(review): not referenced anywhere in the visible part of this file.
INPUT_SHAPE = (224, 128, 3)

# Lane index (position in the model's prediction vector) -> key to send
# through pynput for that lane.
KEYS = {
    0: Key.shift,    # left shift
    1: 'a',
    2: 's',
    3: ';',
    4: "'",
    5: Key.shift_r,  # right shift
}

class Player:
    """
    Loads the trained model, captures screenshots of the game area,
    and presses or releases the lane keys according to the model output.

    Attributes:
        keyboard: pynput ``Controller`` used to send key events.
        key_pressed_state: lane index -> whether this player currently
            holds that lane's key down (as of the last ``update_keys``).
        sct: ``mss`` screen-capture handle.
        device: torch device the model runs on (cuda, mps or cpu).
        model: the loaded model, or ``None`` when loading failed.
    """

    # A lane's key is held while its predicted probability is at least this.
    PRESS_THRESHOLD = 0.6

    def __init__(self):
        self.keyboard = Controller()
        self.key_pressed_state = {i: False for i in range(6)}
        self.sct = mss()

        # Prefer CUDA, then Apple MPS, and fall back to the CPU.
        if torch.cuda.is_available():
            self.device = torch.device("cuda")
        elif torch.backends.mps.is_available():
            self.device = torch.device("mps")
        else:
            self.device = torch.device("cpu")
        print(f"Using device: {self.device}")

        self.model = self.get_loaded_model("checkpoint_epoch_14_loss_0.2887_single_okey.pth")

    def get_loaded_model(self, path):
        """
        Loads the trained model from the given path.

        Args:
            path: path of the checkpoint file holding the state dict.

        Returns:
            The model in eval mode on ``self.device``, or ``None`` if
            anything went wrong (the error is printed, not raised).
        """
        try:
            print("Loading model...")
            model = DJMaxModel().to(self.device)
            # NOTE(security): torch.load unpickles the file; only load
            # checkpoints from a trusted source.
            model.load_state_dict(torch.load(path, map_location=self.device))
            model.eval()
            return model
        except Exception as e:
            # Best-effort boundary: callers check for None instead of
            # handling individual load errors.
            print("Failed to load model:", e)
            return None

    def preprocess_frame(self, resized_frame):
        """
        Processes the frame to feed into the model.

        Args:
            resized_frame: uint8 array of shape (H, W, 3) in RGB order.

        Returns:
            float32 tensor of shape (1, 3, 112, 64) on ``self.device``:
            the frame scaled to [0, 1], converted to luma grayscale,
            replicated over three channels and resized bilinearly.
        """
        scaled = resized_frame.astype(np.float32) / 255.0

        # (H, W, 3) -> (1, 3, H, W)
        frame = torch.from_numpy(scaled).permute(2, 0, 1).unsqueeze(0)

        # Luma weights: 0.299 R + 0.587 G + 0.114 B
        r = frame[:, 0, :, :]
        g = frame[:, 1, :, :]
        b = frame[:, 2, :, :]
        gray = 0.299 * r + 0.587 * g + 0.114 * b      # (1, H, W)

        # Replicate the gray plane into three identical channels.
        frame = gray.unsqueeze(1).repeat(1, 3, 1, 1)  # (1, 3, H, W)

        frame = torch.nn.functional.interpolate(frame, size=(112, 64), mode='bilinear').to(self.device)
        return frame

    def update_keys(self, predictions):
        """
        Presses and releases keys based on the model predictions.

        A key event is sent for every lane on every call: press when the
        probability reaches ``PRESS_THRESHOLD``, release otherwise. The
        resulting state is recorded in ``key_pressed_state`` so that
        ``release_keys`` knows which keys are still held.

        Args:
            predictions: per-lane probabilities, indexed like ``KEYS``.
        """
        for lane, prob in enumerate(predictions):
            if prob < self.PRESS_THRESHOLD:
                self.keyboard.release(KEYS[lane])
                self.key_pressed_state[lane] = False
            else:
                self.keyboard.press(KEYS[lane])
                self.key_pressed_state[lane] = True

    def release_keys(self):
        """
        Releases all keys that are recorded as pressed.
        """
        for lane, is_pressed in self.key_pressed_state.items():
            if is_pressed:
                self.keyboard.release(KEYS[lane])
                self.key_pressed_state[lane] = False
        print("Keys have been released.")

    def play(self):
        """
        Play loop that captures the screen, makes prediction, and presses the keys.

        Runs until interrupted with Ctrl-C. Held keys are released and
        OpenCV windows are closed on the way out, whatever ended the loop.
        """

        print("Starting program...")
        # Screen region of the game area, in screen pixels.
        region = {'top': 65, 'left': 352, 'width': 256, 'height': 448}
        try:
            with torch.no_grad():
                while True:
                    start_time = time.perf_counter()
                    screen = self.sct.grab(region)
                    # Keep every second pixel in both directions; for a
                    # 448x256 grab this gives a (224, 128, 3) RGB frame.
                    frame = np.frombuffer(screen.rgb, dtype=np.uint8).reshape(
                        (screen.height, screen.width, 3)
                    )[::2, ::2, :].copy()
                    processed_frame = self.preprocess_frame(frame)  # (1, 3, 112, 64)

                    output = self.model.inference_forward(processed_frame)
                    # First batch element, last entry along the second axis.
                    # NOTE(review): presumably the newest time step's logits
                    # per lane - confirm against DJMaxModel.inference_forward.
                    prediction_np = torch.sigmoid(output).cpu().numpy()[0, -1]

                    # Guard against a zero interval so the status line can
                    # never abort the loop with ZeroDivisionError.
                    elapsed = time.perf_counter() - start_time
                    fps = 1 / elapsed if elapsed > 0 else float("inf")
                    print([f"{p:.2f}" for p in prediction_np], f" {fps:.2f} fps", end='       \r')
                    self.update_keys(prediction_np)

        except KeyboardInterrupt:
            print("Interrupted.")

        finally:
            self.release_keys()
            cv2.destroyAllWindows()

def main():
    """
    Creates the player and starts the play loop.

    If the model could not be loaded (Player.model is None), prints a
    message and returns without starting the loop.
    """
    player = Player()
    # Compare against None explicitly: loading failure is signalled with
    # None, and the check should not depend on the model's truthiness.
    if player.model is None:
        print("Exiting program.")
        return
    player.play()

# Start the player only when this file is run as a script, not on import.
if __name__ == "__main__":
    main()