# Data-Cleaning-With-Py/using_python_for_data_cleaning.py at main · 00BondViz/Data-Cleaning-With-Py · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
# -*- coding: utf-8 -*-
"""Using Python For Data Cleaning.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1ZWWYTVAOj2NveC_JEtCkQDow0CSxrInM
"""

# Load Packages
import pandas as pd

# Load Dataset: read the raw CSV into a DataFrame
df = pd.read_csv("dataset.csv")

# Peek at the first rows
df.head()

# Column Consistency: inspect the header labels exactly as loaded
df.columns

# Preview the lower-cased labels (the result is not stored)
df.columns.str.lower()

# Store the lower-cased labels back on the frame
df.columns = df.columns.str.lower()

# Renaming: map the labels that contain spaces to snake_case names
column_renames = {'full name': 'full_name', 'date of birth': 'date_of_birth'}

# rename() without assignment returns a new frame and leaves df untouched ...
df.rename(columns=column_renames)

# ... so the labels shown here are still the old ones
df.columns

# Rename and Replace Column Names: keep the renamed frame this time
df = df.rename(columns=column_renames)

df.columns

# Method 1: split the name on single spaces and pick pieces by position
df["full_name"]

# Every row becomes a list of its space-separated pieces
name_tokens = df["full_name"].str.split(" ")
name_tokens

# Get the First Part of the Split
name_tokens.str.get(0)

df["firstname"] = name_tokens.str.get(0)

df["firstname"]

# Last Name: stored as the piece at position 1 (the second token).
# NOTE(review): for names with more than two pieces this is not the final
# token -- confirm that is the intended meaning of "lastname".
df["lastname"] = name_tokens.str.get(1)

df["lastname"]

# Method 2
df.full_name

# Work on an independent copy: plain assignment (df1 = df) would only create
# a second name for the same frame, so any write to df1 would also change df.
df1 = df.copy()

# Method 2 Using Expand: each piece of the split gets its own column
df1.full_name.str.split(" ", expand=True)

# Using expand and n = 1 to split only at the first space, so everything
# after the first piece is grouped into one column
df1.full_name.str.split(" ", n=1, expand=True)

df.head(13)

df['income.1']

# Data Type of Column
df['income.1'].dtype

# Replace the dollar sign with a single space (result is only displayed;
# df itself is not modified).
# regex=False is passed explicitly because "$" is a regular-expression anchor:
# the default of `regex` differs between pandas versions, and spelling it out
# guarantees the character is always treated as a literal.
df['income.1'].str.replace("$", " ", regex=False)

# Replace with Euro (again a literal replacement, displayed only)
df['income.1'].str.replace("$", "Euro", regex=False)

df.head()

df.salary

# As Boolean: True where the salary text contains '19'
df.salary.str.contains('19')

# Get the values of entire rows.
# na=False makes missing salaries count as "no match"; without it the mask
# would carry NaN for those rows and boolean indexing would raise.
df[df.salary.str.contains('19', na=False)]

# Check For Multiple Expression: '|' means "either pattern"
df.salary.str.contains('19|17')

# Same check with the regex flag spelled out
df.salary.str.contains('19|17', regex=True)

# Using Match to Find An Expression: match() only succeeds when the pattern
# matches at the start of the string
df.salary.str.match('19')

df.quote

# Find the rows whose quote starts with the word Operative
# (na=False for the same reason as above: missing quotes are non-matches)
df[df.quote.str.match('Operative', na=False)]

# Finding An Index: Series.filter() tests the index labels, not the salary
# values, so this keeps the rows whose label contains '18'
df.salary.filter(regex='18', axis=0)

# Joining Columns
df.head()

# Method 1: element-wise string concatenation with no separator
df["firstname"] + df["email"]

# Method 1, with an underscore between the two values
df["firstname"] + "_" + df["email"]

# Method 2: join the two values of every row with "_" via a row-wise apply
dfall = df[["firstname", "email"]].apply("_".join, axis=1)

dfall

# Counting Strings in A Column
df["quote"]

# Using str.count() to count the spaces in each quote; adding 1 turns the
# number of separators into a number of space-separated pieces
df["quote"].str.count(' ') + 1

# Split every quote on whitespace once and reuse the resulting lists below
quote_words = df["quote"].str.split()

# Using str.len() to get the number of words in each row of the column
quote_words.str.len()

# Using map(len) to get the number of words in each row of the column
quote_words.map(len)

# Using apply(len) to get the number of words in each row of the column
quote_words.apply(len)

# Get the Total Number of Counts: how many rows have each word count
quote_words.apply(len).value_counts()

df.head()