-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathusing_python_for_data_cleaning.py
More file actions
134 lines (84 loc) · 2.41 KB
/
using_python_for_data_cleaning.py
File metadata and controls
134 lines (84 loc) · 2.41 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
# -*- coding: utf-8 -*-
"""Using Python For Data Cleaning.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1ZWWYTVAOj2NveC_JEtCkQDow0CSxrInM
"""
# Load Packages
import pandas as pd
# --- Load the raw data -------------------------------------------------------
# Read the CSV into the module-level frame that the rest of the script uses.
df = pd.read_csv("dataset.csv")
# Notebook-style preview of the first rows (the value is discarded in a script).
df.head()

# --- Make the column labels consistent ---------------------------------------
df.columns
# Preview only: .str.lower() returns a new Index and leaves df unchanged.
df.columns.str.lower()
# Commit the lower-cased labels back onto the frame.
df.columns = df.columns.str.lower()

# Mapping from the labels containing spaces to their snake_case replacements.
column_renames = {
    "full name": "full_name",
    "date of birth": "date_of_birth",
}

# --- Rename, non-mutating form -----------------------------------------------
# Without inplace=True (or reassignment) rename() hands back a new frame, so the
# df.columns preview right after it still shows the old labels.
df.rename(columns=column_renames)
df.columns

# --- Rename, mutating form ---------------------------------------------------
# inplace=True changes df itself; the preview now shows the new labels.
df.rename(columns=column_renames, inplace=True)
df.columns
# Method 1: split the full name on single spaces and pick pieces by position.
df.full_name
df.full_name.str.split(" ")
# Get the First Part of the Split
df.full_name.str.split(" ").str.get(0)
df['firstname'] = df.full_name.str.split(" ").str.get(0)
df.firstname
# Last Name
# Split only once (n=1) so everything after the first space stays together.
# A full split followed by .str.get(1) returns just the second word and drops
# the rest of any name that has more than two words. Names without a space
# still yield a missing value, and two-word names are unaffected.
df['lastname'] = df.full_name.str.split(" ", n=1).str.get(1)
df.lastname
# Method 2
df.full_name
# Take an independent copy: plain assignment (df1 = df) would only bind a
# second name to the same DataFrame object.
df1 = df.copy()
# Method 2 Using Expand: expand=True returns the pieces as separate columns
# of a new DataFrame instead of a Series of lists.
df1.full_name.str.split(" ", expand=True)
# Using expand and n = 1 to group all other into one column
df1.full_name.str.split(" ", n=1, expand=True)
# Preview the first 13 rows, now including the firstname / lastname columns.
df.head(13)
# Inspect the 'income.1' column and its dtype before cleaning it.
df['income.1']
# Data Type of Column
df['income.1'].dtype
# Replace with Empty
# "$" is a regex metacharacter (end-of-string anchor) and the default of the
# `regex` argument differs between pandas versions, so request a literal
# replacement explicitly. The replacement is the empty string so that no stray
# space is left where the symbol was.
# These calls return new Series; df itself is not modified.
df['income.1'].str.replace("$", "", regex=False)
# Replace with Euro
df['income.1'].str.replace("$", "Euro", regex=False)
df.head()
# Searching text columns with the .str accessor.
df.salary
# As Boolean: True where the salary text contains '19' anywhere.
df.salary.str.contains('19')
# Get the values of entire rows
# na=False makes missing salaries count as "no match"; without it the mask
# would contain NaN and boolean indexing would raise.
df[df.salary.str.contains('19', na=False)]
# Check For Multiple Expression
# str.contains treats the pattern as a regular expression by default, so
# '19|17' means "19 or 17"; the second call just spells regex=True out.
df.salary.str.contains('19|17')
df.salary.str.contains('19|17',regex=True)
# Using Match to Find An Expression
# Unlike contains, str.match only succeeds when the pattern matches at the
# beginning of the string.
df.salary.str.match('19')
df.quote
# Find the rows whose quote starts with the word Operative
# (na=False: rows with a missing quote are excluded instead of raising).
df[df.quote.str.match('Operative', na=False)]
# Finding An Index
# Series.filter selects by index label, not by value: this keeps the entries
# whose row label matches the regex '18'.
df.salary.filter(regex='18',axis=0)
# --- Combining two text columns into one --------------------------------------
df.head()

# Approach 1a: element-wise concatenation with the + operator (no separator).
df["firstname"] + df["email"]

# Approach 1b: the same concatenation with an underscore between the values.
df["firstname"] + "_" + df["email"]

# Approach 2: select both columns and join each row's values with "_".
dfall = df[["firstname", "email"]].apply(lambda row: "_".join(row), axis=1)
dfall
# --- Counting words in a text column ------------------------------------------
df["quote"]

# Variant 1: count the space characters in each quote and add one.
df["quote"].str.count(" ").add(1)

# Variant 2: split each quote on whitespace, then take the list length with
# the vectorised .str.len().
df["quote"].str.split().str.len()

# Variant 3: same split, with the built-in len applied through Series.map.
df["quote"].str.split().map(len)

# Variant 4: same split, with the built-in len applied through Series.apply.
df["quote"].str.split().apply(len)

# Frequency table: how many quotes have each word count.
df["quote"].str.split().apply(len).value_counts()

df.head()