In [1]:
# The usual preamble

%matplotlib inline

import pandas as pd

import matplotlib.pyplot as plt

import numpy as np

import ssl
# Allow read_csv to fetch the data over https without certificate verification
ssl._create_default_https_context = ssl._create_unverified_context

# Make the graphs a bit prettier, and bigger

plt.style.use('ggplot')

plt.rcParams['figure.figsize'] = (15, 5)

plt.rcParams['font.family'] = 'sans-serif'

# This is necessary to show lots of columns in pandas 0.12. 

# Not necessary in pandas 0.13.

pd.set_option('display.width', 5000) 

pd.set_option('display.max_columns', 60)

One of the main problems with messy data is: how do you know if it's messy or not?

We're going to use the NYC 311 service request dataset again here, since it's big and a bit unwieldy.

In [2]:
requests = pd.read_csv('https://sciencedata.dk/public/6e3ed434c0fa43df906ce2b6d1ba9fc6/pandas-cookbook/data/311-service-requests.csv', dtype='unicode')

7.1 How do we know if it's messy?

We're going to look at a few columns here. I know already that there are some problems with the zip code, so let's look at that first.

To get a sense for whether a column has problems, I usually use .unique() to look at all its values. If it's a numeric column, I'll instead plot a histogram to get a sense of the distribution.
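
For a quick first pass, that can be as simple as this (a sketch; df and the column names are placeholders for whatever you're inspecting):

df['suspect_column'].unique()   # eyeball every distinct value
df['numeric_column'].hist()     # histogram to see the distribution of a numeric column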

When we look at the unique values in "Incident Zip", it quickly becomes clear that this is a mess.

Some of the problems:

  • Without the dtype='unicode' above, some would have been parsed as strings and some as floats

  • There are nans

  • Some of the zip codes are strange, like '29616-0759' (nine digits) or '00083'

  • There are some N/A values that pandas didn't recognize, like 'N/A' and 'NO CLUE'

What we can do:

  • Normalize 'N/A' and 'NO CLUE' into regular nan values (one way to do this without re-reading the file is sketched after this list)

  • Look at what's up with '00083', and decide what to do

  • Make everything strings
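
We'll do the normalization at read time in the next section, but if re-reading the CSV weren't practical, a replace() on the column we already have would do the same job (a sketch, using the frame as read above):

requests['Incident Zip'] = requests['Incident Zip'].replace(['N/A', 'NO CLUE'], np.nan)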

In [3]:
requests['Incident Zip'].unique()
Out[3]:
array(['11432', '11378', '10032', '10023', '10027', '11372', '11419',
       '11417', '10011', '11225', '11218', '10003', '10029', '10466',
       '11219', '10025', '10310', '11236', nan, '10033', '11216', '10016',
       '10305', '10312', '10026', '10309', '10036', '11433', '11235',
       '11213', '11379', '11101', '10014', '11231', '11234', '10457',
       '10459', '10465', '11207', '10002', '10034', '11233', '10453',
       '10456', '10469', '11374', '11221', '11421', '11215', '10007',
       '10019', '11205', '11418', '11369', '11249', '10005', '10009',
       '11211', '11412', '10458', '11229', '10065', '10030', '11222',
       '10024', '10013', '11420', '11365', '10012', '11214', '11212',
       '10022', '11232', '11040', '11226', '10281', '11102', '11208',
       '10001', '10472', '11414', '11223', '10040', '11220', '11373',
       '11203', '11691', '11356', '10017', '10452', '10280', '11217',
       '10031', '11201', '11358', '10128', '11423', '10039', '10010',
       '11209', '10021', '10037', '11413', '11375', '11238', '10473',
       '11103', '11354', '11361', '11106', '11385', '10463', '10467',
       '11204', '11237', '11377', '11364', '11434', '11435', '11210',
       '11228', '11368', '11694', '10464', '11415', '10314', '10301',
       '10018', '10038', '11105', '11230', '10468', '11104', '10471',
       '11416', '10075', '11422', '11355', '10028', '10462', '10306',
       '10461', '11224', '11429', '10035', '11366', '11362', '11206',
       '10460', '10304', '11360', '11411', '10455', '10475', '10069',
       '10303', '10308', '10302', '11357', '10470', '11367', '11370',
       '10454', '10451', '11436', '11426', '10153', '11004', '11428',
       '11427', '11001', '11363', '10004', '10474', '11430', '10000',
       '10307', '11239', '10119', '10006', '10048', '11697', '11692',
       '11693', '10573', '00083', '11559', '10020', '77056', '11776',
       '70711', '10282', '11109', '10044', '02061', '77092-2016', '14225',
       '55164-0737', '19711', '07306', '000000', 'NO CLUE', '90010',
       '11747', '23541', '11788', '07604', '10112', '11563', '11580',
       '07087', '11042', '07093', '11501', '92123', '00000', '11575',
       '07109', '11797', '10803', '11716', '11722', '11549-3650', '10162',
       '23502', '11518', '07020', '08807', '11577', '07114', '11003',
       '07201', '61702', '10103', '29616-0759', '35209-3114', '11520',
       '11735', '10129', '11005', '41042', '11590', '06901', '07208',
       '11530', '13221', '10954', '11111', '10107'], dtype=object)

7.2 Fixing the nan values and string/float confusion

We can pass a na_values option to pd.read_csv to clean this up a little bit. We can also specify that the type of Incident Zip is a string, not a float.

In [4]:
na_values = ['NO CLUE', 'N/A', '0']

requests = pd.read_csv('https://sciencedata.dk/public/6e3ed434c0fa43df906ce2b6d1ba9fc6/pandas-cookbook/data/311-service-requests.csv', na_values=na_values, dtype={'Incident Zip': str})
In [5]:
requests['Incident Zip'].unique()
Out[5]:
array(['11432', '11378', '10032', '10023', '10027', '11372', '11419',
       '11417', '10011', '11225', '11218', '10003', '10029', '10466',
       '11219', '10025', '10310', '11236', nan, '10033', '11216', '10016',
       '10305', '10312', '10026', '10309', '10036', '11433', '11235',
       '11213', '11379', '11101', '10014', '11231', '11234', '10457',
       '10459', '10465', '11207', '10002', '10034', '11233', '10453',
       '10456', '10469', '11374', '11221', '11421', '11215', '10007',
       '10019', '11205', '11418', '11369', '11249', '10005', '10009',
       '11211', '11412', '10458', '11229', '10065', '10030', '11222',
       '10024', '10013', '11420', '11365', '10012', '11214', '11212',
       '10022', '11232', '11040', '11226', '10281', '11102', '11208',
       '10001', '10472', '11414', '11223', '10040', '11220', '11373',
       '11203', '11691', '11356', '10017', '10452', '10280', '11217',
       '10031', '11201', '11358', '10128', '11423', '10039', '10010',
       '11209', '10021', '10037', '11413', '11375', '11238', '10473',
       '11103', '11354', '11361', '11106', '11385', '10463', '10467',
       '11204', '11237', '11377', '11364', '11434', '11435', '11210',
       '11228', '11368', '11694', '10464', '11415', '10314', '10301',
       '10018', '10038', '11105', '11230', '10468', '11104', '10471',
       '11416', '10075', '11422', '11355', '10028', '10462', '10306',
       '10461', '11224', '11429', '10035', '11366', '11362', '11206',
       '10460', '10304', '11360', '11411', '10455', '10475', '10069',
       '10303', '10308', '10302', '11357', '10470', '11367', '11370',
       '10454', '10451', '11436', '11426', '10153', '11004', '11428',
       '11427', '11001', '11363', '10004', '10474', '11430', '10000',
       '10307', '11239', '10119', '10006', '10048', '11697', '11692',
       '11693', '10573', '00083', '11559', '10020', '77056', '11776',
       '70711', '10282', '11109', '10044', '02061', '77092-2016', '14225',
       '55164-0737', '19711', '07306', '000000', '90010', '11747',
       '23541', '11788', '07604', '10112', '11563', '11580', '07087',
       '11042', '07093', '11501', '92123', '00000', '11575', '07109',
       '11797', '10803', '11716', '11722', '11549-3650', '10162', '23502',
       '11518', '07020', '08807', '11577', '07114', '11003', '07201',
       '61702', '10103', '29616-0759', '35209-3114', '11520', '11735',
       '10129', '11005', '41042', '11590', '06901', '07208', '11530',
       '13221', '10954', '11111', '10107'], dtype=object)

7.3 What's up with the dashes?

In [6]:
# str.contains returns nan for missing zips, so fill those in with False to get a usable boolean mask
rows_with_dashes = requests['Incident Zip'].str.contains('-').fillna(False)

len(requests[rows_with_dashes])
Out[6]:
5
In [7]:
requests[rows_with_dashes]
Out[7]:
Unique Key Created Date Closed Date Agency Agency Name Complaint Type Descriptor Location Type Incident Zip Incident Address Street Name Cross Street 1 Cross Street 2 Intersection Street 1 Intersection Street 2 Address Type City Landmark Facility Type Status Due Date Resolution Action Updated Date Community Board Borough X Coordinate (State Plane) Y Coordinate (State Plane) Park Facility Name Park Borough School Name School Number School Region School Code School Phone Number School Address School City School State School Zip School Not Found School or Citywide Complaint Vehicle Type Taxi Company Borough Taxi Pick Up Location Bridge Highway Name Bridge Highway Direction Road Ramp Bridge Highway Segment Garage Lot Name Ferry Direction Ferry Terminal Name Latitude Longitude Location
29136 26550551 10/24/2013 06:16:34 PM NaN DCA Department of Consumer Affairs Consumer Complaint False Advertising NaN 77092-2016 2700 EAST SELTICE WAY EAST SELTICE WAY NaN NaN NaN NaN NaN HOUSTON NaN NaN Assigned 11/13/2013 11:15:20 AM 10/29/2013 11:16:16 AM 0 Unspecified Unspecified NaN NaN Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified N NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
30939 26548831 10/24/2013 09:35:10 AM NaN DCA Department of Consumer Affairs Consumer Complaint Harassment NaN 55164-0737 P.O. BOX 64437 64437 NaN NaN NaN NaN NaN ST. PAUL NaN NaN Assigned 11/13/2013 02:30:21 PM 10/29/2013 02:31:06 PM 0 Unspecified Unspecified NaN NaN Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified N NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
70539 26488417 10/15/2013 03:40:33 PM NaN TLC Taxi and Limousine Commission Taxi Complaint Driver Complaint Street 11549-3650 365 HOFSTRA UNIVERSITY HOFSTRA UNIVERSITY NaN NaN NaN NaN NaN HEMSTEAD NaN NaN Assigned 11/30/2013 01:20:33 PM 10/16/2013 01:21:39 PM 0 Unspecified Unspecified NaN NaN Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified N NaN NaN NaN La Guardia Airport NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
85821 26468296 10/10/2013 12:36:43 PM 10/26/2013 01:07:07 AM DCA Department of Consumer Affairs Consumer Complaint Debt Not Owed NaN 29616-0759 PO BOX 25759 BOX 25759 NaN NaN NaN NaN NaN GREENVILLE NaN NaN Closed 10/26/2013 09:20:28 AM 10/26/2013 01:07:07 AM 0 Unspecified Unspecified NaN NaN Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified N NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
89304 26461137 10/09/2013 05:23:46 PM 10/25/2013 01:06:41 AM DCA Department of Consumer Affairs Consumer Complaint Harassment NaN 35209-3114 600 BEACON PKWY BEACON PKWY NaN NaN NaN NaN NaN BIRMINGHAM NaN NaN Closed 10/25/2013 02:43:42 PM 10/25/2013 01:06:41 AM 0 Unspecified Unspecified NaN NaN Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified N NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

I thought these were missing data and originally deleted them like this:

requests['Incident Zip'][rows_with_dashes] = np.nan

(In modern pandas that chained assignment raises a SettingWithCopyWarning and may silently fail to update the frame; requests.loc[rows_with_dashes, 'Incident Zip'] = np.nan is the reliable form.)

But then my friend Dave pointed out that 9-digit zip codes are normal. Let's look at all the zip codes with more than 5 digits, make sure they're okay, and then truncate them.

In [8]:
long_zip_codes = requests['Incident Zip'].str.len() > 5

requests['Incident Zip'][long_zip_codes].unique()
Out[8]:
array(['77092-2016', '55164-0737', '000000', '11549-3650', '29616-0759',
       '35209-3114'], dtype=object)

Those all look okay to truncate to me.

In [18]:
requests['Incident Zip'] = requests['Incident Zip'].str.slice(0, 5)

Done.
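
(Incidentally, .str.slice(0, 5) is equivalent to the indexing shorthand requests['Incident Zip'].str[:5]; both keep the five-character prefix and pass nan values through untouched.)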

Earlier I thought 00083 was a broken zip code, but it turns out 00083 is Central Park's zip code! Shows what I know. I'm still concerned about the 00000 zip codes, though: let's look at those.

In [10]:
requests[requests['Incident Zip'] == '00000']
Out[10]:
Unique Key Created Date Closed Date Agency Agency Name Complaint Type Descriptor Location Type Incident Zip Incident Address Street Name Cross Street 1 Cross Street 2 Intersection Street 1 Intersection Street 2 Address Type City Landmark Facility Type Status Due Date Resolution Action Updated Date Community Board Borough X Coordinate (State Plane) Y Coordinate (State Plane) Park Facility Name Park Borough School Name School Number School Region School Code School Phone Number School Address School City School State School Zip School Not Found School or Citywide Complaint Vehicle Type Taxi Company Borough Taxi Pick Up Location Bridge Highway Name Bridge Highway Direction Road Ramp Bridge Highway Segment Garage Lot Name Ferry Direction Ferry Terminal Name Latitude Longitude Location
42600 26529313 10/22/2013 02:51:06 PM NaN TLC Taxi and Limousine Commission Taxi Complaint Driver Complaint NaN 00000 EWR EWR EWR NaN NaN NaN NaN NaN NEWARK NaN NaN Assigned 12/07/2013 09:53:51 AM 10/23/2013 09:54:43 AM 0 Unspecified Unspecified NaN NaN Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified N NaN NaN NaN Other NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
60843 26507389 10/17/2013 05:48:44 PM NaN TLC Taxi and Limousine Commission Taxi Complaint Driver Complaint Street 00000 1 NEWARK AIRPORT NEWARK AIRPORT NaN NaN NaN NaN NaN NEWARK NaN NaN Assigned 12/02/2013 11:59:46 AM 10/18/2013 12:01:08 PM 0 Unspecified Unspecified NaN NaN Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified N NaN NaN NaN Other NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

This looks bad to me. Let's set these to nan.

In [11]:
zero_zips = requests['Incident Zip'] == '00000'

requests.loc[zero_zips, 'Incident Zip'] = np.nan

Great. Let's see where we are now:

In [21]:
# astype(str) turns the float nan into the string 'nan', so the array can be sorted
unique_zips = requests['Incident Zip'].unique().astype(str)

unique_zips.sort()

unique_zips
Out[21]:
array(['00083', '02061', '06901', '07020', '07087', '07093', '07109',
       '07114', '07201', '07208', '07306', '07604', '08807', '10000',
       '10001', '10002', '10003', '10004', '10005', '10006', '10007',
       '10009', '10010', '10011', '10012', '10013', '10014', '10016',
       '10017', '10018', '10019', '10020', '10021', '10022', '10023',
       '10024', '10025', '10026', '10027', '10028', '10029', '10030',
       '10031', '10032', '10033', '10034', '10035', '10036', '10037',
       '10038', '10039', '10040', '10044', '10048', '10065', '10069',
       '10075', '10103', '10107', '10112', '10119', '10128', '10129',
       '10153', '10162', '10280', '10281', '10282', '10301', '10302',
       '10303', '10304', '10305', '10306', '10307', '10308', '10309',
       '10310', '10312', '10314', '10451', '10452', '10453', '10454',
       '10455', '10456', '10457', '10458', '10459', '10460', '10461',
       '10462', '10463', '10464', '10465', '10466', '10467', '10468',
       '10469', '10470', '10471', '10472', '10473', '10474', '10475',
       '10573', '10803', '10954', '11001', '11003', '11004', '11005',
       '11040', '11042', '11101', '11102', '11103', '11104', '11105',
       '11106', '11109', '11111', '11201', '11203', '11204', '11205',
       '11206', '11207', '11208', '11209', '11210', '11211', '11212',
       '11213', '11214', '11215', '11216', '11217', '11218', '11219',
       '11220', '11221', '11222', '11223', '11224', '11225', '11226',
       '11228', '11229', '11230', '11231', '11232', '11233', '11234',
       '11235', '11236', '11237', '11238', '11239', '11249', '11354',
       '11355', '11356', '11357', '11358', '11360', '11361', '11362',
       '11363', '11364', '11365', '11366', '11367', '11368', '11369',
       '11370', '11372', '11373', '11374', '11375', '11377', '11378',
       '11379', '11385', '11411', '11412', '11413', '11414', '11415',
       '11416', '11417', '11418', '11419', '11420', '11421', '11422',
       '11423', '11426', '11427', '11428', '11429', '11430', '11432',
       '11433', '11434', '11435', '11436', '11501', '11518', '11520',
       '11530', '11549', '11559', '11563', '11575', '11577', '11580',
       '11590', '11691', '11692', '11693', '11694', '11697', '11716',
       '11722', '11735', '11747', '11776', '11788', '11797', '13221',
       '14225', '19711', '23502', '23541', '29616', '35209', '41042',
       '55164', '61702', '70711', '77056', '77092', '90010', '92123',
       'nan'], dtype='<U5')

Amazing! This is much cleaner. There's something a bit weird here, though -- I looked up 77056 on Google maps, and that's in Texas.

Let's take a closer look:

In [22]:
zips = requests['Incident Zip']

# Let's say the zips starting with '0' and '1' are okay, for now. (This isn't strictly true -- 13221 starts with '1' but is in Syracuse, well outside the city.)

is_close = zips.str.startswith('0') | zips.str.startswith('1')

# There are a bunch of NaNs, but we're not interested in them right now, so we'll say they're False

is_far = ~(is_close) & zips.notnull()
In [23]:
zips[is_far]
Out[23]:
12102    77056
13450    70711
29136    77092
30939    55164
44008    90010
47048    23541
57636    92123
71001    92123
71834    23502
80573    61702
85821    29616
89304    35209
94201    41042
Name: Incident Zip, dtype: object
In [24]:
requests[is_far][['Incident Zip', 'Descriptor', 'City']].sort_values('Incident Zip')
Out[24]:
Incident Zip Descriptor City
71834 23502 Harassment NORFOLK
47048 23541 Harassment NORFOLK
85821 29616 Debt Not Owed GREENVILLE
89304 35209 Harassment BIRMINGHAM
94201 41042 Harassment FLORENCE
30939 55164 Harassment ST. PAUL
80573 61702 Billing Dispute BLOOMIGTON
13450 70711 Contract Dispute CLIFTON
12102 77056 Debt Not Owed HOUSTON
29136 77092 False Advertising HOUSTON
44008 90010 Billing Dispute LOS ANGELES
57636 92123 Harassment SAN DIEGO
71001 92123 Billing Dispute SAN DIEGO

Okay, there really are requests coming from LA and Houston! Good to know. Filtering by zip code is probably a bad way to handle this -- we should really be looking at the city instead.

In [25]:
requests['City'].str.upper().value_counts()
Out[25]:
BROOKLYN                  31662
NEW YORK                  22664
BRONX                     18438
STATEN ISLAND              4766
JAMAICA                    2246
                          ...  
EAST ROCKAWAY                 1
BOHIEMA                       1
ROSLYN                        1
PORT JEFFERSON STATION        1
STAMFORD                      1
Name: City, Length: 100, dtype: int64

It looks like these are legitimate complaints, so we'll just leave them alone.
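
Since comparing cities turned out to be more reliable than comparing zips, it might be worth persisting that normalization for later steps (a sketch, just reusing the str.upper() from above):

requests['City'] = requests['City'].str.upper()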

7.4 Putting it together

Here's what we ended up doing to clean up our zip codes, all together:

In [26]:
na_values = ['NO CLUE', 'N/A', '0']

requests = pd.read_csv('https://sciencedata.dk/public/6e3ed434c0fa43df906ce2b6d1ba9fc6/pandas-cookbook/data/311-service-requests.csv', 

                       na_values=na_values, 

                       dtype={'Incident Zip': str})
In [27]:
def fix_zip_codes(zips):

    # Truncate everything to length 5 

    zips = zips.str.slice(0, 5)

    

    # Set 00000 zip codes to nan

    zero_zips = zips == '00000'

    zips[zero_zips] = np.nan

    

    return zips
In [31]:
requests['Incident Zip'] = fix_zip_codes(requests['Incident Zip'])
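
As a quick sanity check on the cleaned-up column (a sketch, not part of the original recipe):

assert (requests['Incident Zip'].str.len().dropna() <= 5).all()   # everything truncated
assert not (requests['Incident Zip'] == '00000').any()            # no all-zero zips left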
In [37]:
unique_incidents_zip = requests['Incident Zip'].unique().astype(str)

unique_incidents_zip.sort()

unique_incidents_zip
Out[37]:
array(['00083', '02061', '06901', '07020', '07087', '07093', '07109',
       '07114', '07201', '07208', '07306', '07604', '08807', '10000',
       '10001', '10002', '10003', '10004', '10005', '10006', '10007',
       '10009', '10010', '10011', '10012', '10013', '10014', '10016',
       '10017', '10018', '10019', '10020', '10021', '10022', '10023',
       '10024', '10025', '10026', '10027', '10028', '10029', '10030',
       '10031', '10032', '10033', '10034', '10035', '10036', '10037',
       '10038', '10039', '10040', '10044', '10048', '10065', '10069',
       '10075', '10103', '10107', '10112', '10119', '10128', '10129',
       '10153', '10162', '10280', '10281', '10282', '10301', '10302',
       '10303', '10304', '10305', '10306', '10307', '10308', '10309',
       '10310', '10312', '10314', '10451', '10452', '10453', '10454',
       '10455', '10456', '10457', '10458', '10459', '10460', '10461',
       '10462', '10463', '10464', '10465', '10466', '10467', '10468',
       '10469', '10470', '10471', '10472', '10473', '10474', '10475',
       '10573', '10803', '10954', '11001', '11003', '11004', '11005',
       '11040', '11042', '11101', '11102', '11103', '11104', '11105',
       '11106', '11109', '11111', '11201', '11203', '11204', '11205',
       '11206', '11207', '11208', '11209', '11210', '11211', '11212',
       '11213', '11214', '11215', '11216', '11217', '11218', '11219',
       '11220', '11221', '11222', '11223', '11224', '11225', '11226',
       '11228', '11229', '11230', '11231', '11232', '11233', '11234',
       '11235', '11236', '11237', '11238', '11239', '11249', '11354',
       '11355', '11356', '11357', '11358', '11360', '11361', '11362',
       '11363', '11364', '11365', '11366', '11367', '11368', '11369',
       '11370', '11372', '11373', '11374', '11375', '11377', '11378',
       '11379', '11385', '11411', '11412', '11413', '11414', '11415',
       '11416', '11417', '11418', '11419', '11420', '11421', '11422',
       '11423', '11426', '11427', '11428', '11429', '11430', '11432',
       '11433', '11434', '11435', '11436', '11501', '11518', '11520',
       '11530', '11549', '11559', '11563', '11575', '11577', '11580',
       '11590', '11691', '11692', '11693', '11694', '11697', '11716',
       '11722', '11735', '11747', '11776', '11788', '11797', '13221',
       '14225', '19711', '23502', '23541', '29616', '35209', '41042',
       '55164', '61702', '70711', '77056', '77092', '90010', '92123',
       'nan'], dtype='<U5')