-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathMAL_R_Script.R
More file actions
72 lines (53 loc) · 2.08 KB
/
MAL_R_Script.R
File metadata and controls
72 lines (53 loc) · 2.08 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
# Load necessary libraries
library(dplyr)
library(readr)
library(stringr)
# Read the anime list CSV into `df`.
# NOTE: the path is machine-specific; point it at your local copy of the file.
df <- read_csv(
  file = "C:/Users/khaos/Documents/UM/DSC project/mal_anime_list.csv"
)

# Preview the first few rows
df %>% head()

# Report how many rows and columns were loaded
cat("Dataset contains", dim(df)[1], "rows and", dim(df)[2], "columns.\n")

# List the column names
names(df)

# Number of missing (NA) values per column
df %>%
  is.na() %>%
  colSums()
# Keep only rows where every critical field is present.
# filter() combines comma-separated conditions with AND, so a row is kept
# only when none of these columns is NA.
df <- filter(
  df,
  !is.na(rank),
  !is.na(mean),
  !is.na(popularity),
  !is.na(num_episodes),
  !is.na(media_type)
)
# Fill Missing Categorical Values
#df <- df %>%
# mutate(
# studios = ifelse(is.na(studios), "Unknown", studios),
# genres = ifelse(is.na(genres), "Unknown", genres),
# status = ifelse(is.na(status), "Unknown", status),
# source = ifelse(is.na(source), "Unknown", source),
# synopsis = ifelse(is.na(synopsis), "Unknown", synopsis)
# )
# Normalise missing text in the descriptive columns: NA, empty strings ("")
# and whitespace-only strings all become the literal "Unknown". Any other
# value is kept exactly as it was (it is not trimmed).
df <- df %>%
  mutate(
    across(
      all_of(c("studios", "genres", "status", "source", "synopsis")),
      # str_trim(x) == "" is TRUE for "" as well as for whitespace-only text,
      # so no separate empty-string comparison is needed.
      ~ ifelse(is.na(.x) | str_trim(.x) == "", "Unknown", .x)
    )
  )
# Convert start_date and end_date to Date, reading them as month/day/year.
# For end_date, the placeholder "N/A" is blanked to NA before parsing
# (presumably it marks titles that have not finished airing; confirm
# against the data).
df <- df %>%
  mutate(
    start_date = as.Date(start_date, format = "%m/%d/%Y"),
    end_date = as.Date(
      # which() skips NA comparisons, so values already missing stay NA
      replace(end_date, which(end_date == "N/A"), NA),
      format = "%m/%d/%Y"
    )
  )
# Convert the categorical columns to factors in one pass
df <- df %>%
  mutate(
    across(
      all_of(c("status", "rating", "media_type", "source")),
      as.factor
    )
  )
# Checks
# end_date has been converted to Date above, so rows whose end date was
# "N/A" now hold NA; no "Ongoing" label exists in this column. table() omits
# NA unless asked, so request it explicitly to see how many rows have no
# end date.
table(df$end_date, useNA = "ifany")