From 25156c83698bcae174cc72c92dac825fbb4c7110 Mon Sep 17 00:00:00 2001 From: Sally Steuterman Date: Fri, 1 Mar 2024 18:04:41 -0600 Subject: [PATCH 1/4] Rough draft --- content/cleaning-pandas/_index.md | 21 +++ content/cleaning-pandas/exercises/_index.md | 22 ++++ content/cleaning-pandas/next-steps.md | 8 ++ content/cleaning-pandas/reading/_index.md | 10 ++ .../reading/inconsistent-data/_index.md | 31 +++++ .../reading/introduction/_index.md | 18 +++ .../reading/irregular-data/_index.md | 43 ++++++ .../reading/missing-data/_index.md | 122 ++++++++++++++++++ .../reading/unnecessary-data/_index.md | 25 ++++ content/cleaning-pandas/studio/_index.md | 26 ++++ 10 files changed, 326 insertions(+) create mode 100644 content/cleaning-pandas/_index.md create mode 100644 content/cleaning-pandas/exercises/_index.md create mode 100644 content/cleaning-pandas/next-steps.md create mode 100644 content/cleaning-pandas/reading/_index.md create mode 100644 content/cleaning-pandas/reading/inconsistent-data/_index.md create mode 100644 content/cleaning-pandas/reading/introduction/_index.md create mode 100644 content/cleaning-pandas/reading/irregular-data/_index.md create mode 100644 content/cleaning-pandas/reading/missing-data/_index.md create mode 100644 content/cleaning-pandas/reading/unnecessary-data/_index.md create mode 100644 content/cleaning-pandas/studio/_index.md diff --git a/content/cleaning-pandas/_index.md b/content/cleaning-pandas/_index.md new file mode 100644 index 00000000..9ef8add5 --- /dev/null +++ b/content/cleaning-pandas/_index.md @@ -0,0 +1,21 @@ ++++ +pre = "15. " +chapter = true +title = "Cleaning Data with Pandas" +date = 2024-02-27T13:59:58-06:00 +draft = false +weight = 15 ++++ + +## Learning Objectives + +Upon completing all the content in this chapter, you should be able to do the following: + +1. Use Pandas to locate different types of dirty data. +1. Use Pandas to resolve different types of dirty data. 
+ +## Key Terminology + +## Content Links + +{{% children %}} diff --git a/content/cleaning-pandas/exercises/_index.md b/content/cleaning-pandas/exercises/_index.md new file mode 100644 index 00000000..0805e4c7 --- /dev/null +++ b/content/cleaning-pandas/exercises/_index.md @@ -0,0 +1,22 @@ ++++ +title = "Exercises: Cleaning Data with Pandas" +date = 2021-10-01T09:28:27-05:00 +draft = false +weight = 2 ++++ + +## Getting Started + +Fork this [GitHub repository](https://github.com/launchcodeeducation/cleaning-data/blob/main/Cleaning%20Data%20Exercises.ipynb) and clone to your computer. + +## Code Along + +1. Select one of the following data sets to work with. [Women's E-commerce Clothing Reviews](https://www.kaggle.com/nicapotato/womens-ecommerce-clothing-reviews). +1. Download the dataset and add it to your Jupyter Notebook. +1. Work through the exercise notebook using the dataset. + +## Submitting Your Work + +When finished make sure to push your changes up to GitHub. + +Copy the link to your GitHub repository and paste it into the submission box in Canvas for **Exercises: Cleaning Data** and click *Submit*. 
diff --git a/content/cleaning-pandas/next-steps.md b/content/cleaning-pandas/next-steps.md new file mode 100644 index 00000000..84511e46 --- /dev/null +++ b/content/cleaning-pandas/next-steps.md @@ -0,0 +1,8 @@ ++++ +title = "Next Steps" +date = 2021-10-01T09:28:27-05:00 +draft = false +weight = 4 ++++ + +## Next Steps diff --git a/content/cleaning-pandas/reading/_index.md b/content/cleaning-pandas/reading/_index.md new file mode 100644 index 00000000..c4fd1ab4 --- /dev/null +++ b/content/cleaning-pandas/reading/_index.md @@ -0,0 +1,10 @@ ++++ +title = "Reading" +date = 2024-02-27T13:59:58-06:00 +draft = false +weight = 1 ++++ + +## Reading Content + +{{% children %}} diff --git a/content/cleaning-pandas/reading/inconsistent-data/_index.md b/content/cleaning-pandas/reading/inconsistent-data/_index.md new file mode 100644 index 00000000..022a560f --- /dev/null +++ b/content/cleaning-pandas/reading/inconsistent-data/_index.md @@ -0,0 +1,31 @@ ++++ +title = "Handling Inconsistent Data" +draft = false +weight = 5 ++++ + +Inconsistent data is data that is not properly formatted for the analysis. + +```console + Seller_Id Seller Sales Total_Rating Current_Items Star_Seller +0 8967 Orchid Jewels 17,896 4.5 22 0 +1 908764 Ducky Ducks 5,478 3.8 10 True +2 7463529 Candy Yarns 89,974 4.8 18 True +3 161729 Parks Pins 6,897 4.9 87 True +4 4217 Sierra's Stationary 112,988 4.3 347 0 +5 21378 Star Stitchery 53,483 4.2 52 0 +``` + +If we take a look at `Star_Seller`, we can see that all the star sellers are labeled with `True` and those who aren't star sellers have `0`. + +We can choose either to convert everything either to booleans or numbers. 
+ +Booleans + +Change string 0 to False +Convert + +Numbers + +Change string True to 1 +Convert \ No newline at end of file diff --git a/content/cleaning-pandas/reading/introduction/_index.md b/content/cleaning-pandas/reading/introduction/_index.md new file mode 100644 index 00000000..d83c0fb0 --- /dev/null +++ b/content/cleaning-pandas/reading/introduction/_index.md @@ -0,0 +1,18 @@ ++++ +title = "Revisiting Cleaning Data" +draft = false +weight = 1 ++++ + +We have already encountered cleaning data, so we should revisit the topic quicky. + +As we discussed in the previous chapter on cleaning data, we need to clean our data to ensure that our analysis is accurate. If we want to perform a regression analysis later on, the analysis will use all the existing data points. By cleaning our data first, we can ensure that the data points being used in the analysis are what we need. + +As we previously covered, there are four types of dirty data: + +1. missing data +1. irregular data +1. unnecessary data +1. inconsistent data + +Pandas has different ways of handling these types. Because pandas is so powerful, we can easily search for dirty data and handle it. Let's examine each dirty data type and how we can clean it in pandas. diff --git a/content/cleaning-pandas/reading/irregular-data/_index.md b/content/cleaning-pandas/reading/irregular-data/_index.md new file mode 100644 index 00000000..86548f7d --- /dev/null +++ b/content/cleaning-pandas/reading/irregular-data/_index.md @@ -0,0 +1,43 @@ ++++ +title = "Handling Irregular Data" +draft = false +weight = 3 ++++ + +Irregular data is a term that commonly applies to outliers. + +Let's revisit `etsy_sellers`. 
+ +```console + Seller Sales Total_Rating Current_Items +0 Orchid Jewels 17,896 4.5 22 +1 Ducky Ducks 5,478 3.8 10 +2 Candy Yarns 89,974 4.8 18 +3 Parks Pins 6,897 4.9 87 +4 Sierra's Stationary 112,988 6.7 347 +5 Star Stitchery 53,483 4.2 52 +``` + +Because this dataframe is so small, you might be able to spot some data points that look like outliers, so let's dive in and check out how we can investigate outliers and handle their prescence in our dataset. + +## Descriptive Statistics + +We have used descriptive statistics a lot so far, but it really is a data analyst's bread and butter! We might notice that the max and min of the `Sales` are pretty far apart, but since Etsy hosts all sorts of sellers from well-established ones to new businesses so it isn't out of the realm of possiblity that all those numbers are actually appropriate. + +However, when we use the `descibe()` function, we might notice that the max is 6.7 which is an outlier. The highest number of stars a shop can have on Etsy is 5 so something is up here and we need to investigate. + +## Visualizing Outliers + +The two most common visualization types for locating outliers are histograms and scatterplots. + +```python +etsy_sellers.plot.scatter(x="Seller",y="Total_Rating") +``` + +In this case, while we actually would find it more helpful to do a histogram. + +```python +etsy_sellers.plot.hist(column="Total_Rating") +``` + +While one plot visualization helps highlight outliers better than others, it entirely depends on what you are working on. \ No newline at end of file diff --git a/content/cleaning-pandas/reading/missing-data/_index.md b/content/cleaning-pandas/reading/missing-data/_index.md new file mode 100644 index 00000000..93d0d674 --- /dev/null +++ b/content/cleaning-pandas/reading/missing-data/_index.md @@ -0,0 +1,122 @@ ++++ +title = "Handling Missing Data" +draft = false +weight = 2 ++++ + +What is missing data? + +Missing data is when a value for either a row or column is not filled in. 
pandas has different data types to describe missing data. `None` and `NaN` both represent missing values, however, the two are not actually equivalent and the boolean expression `None == nan` evaluates to `False`. This is because `None` is a Python object and `NaN` is a floating point value. While pandas has many built-in ways to handle missing data that treat these two data types as interchangeable, when working on your analysis, you may have to code a custom solution. + +{{% notice blue Note %}} + +pandas has even more types to represent a missing value, such as a data type to represent a missing datetime value. For now, we will focus on `None` and `NaN`. + +{{% /notice %}} + +pandas is so intelligent that it can account for missing values when doing summary statistics, so we cannot use summary statistics to start to detect our missing values. We need to use built-in functionality to locate these values and handle them. pandas comes with a built-in function called `isna()` to help us here. + +{{% notice blue Note %}} + +pandas also has a function called `isnull()` which is an alias for `isna()`. You may see this one used frequently online so keep an eye out! + +{{% /notice %}} + +`isna()` can be run on either a series or a dataframe. Let's first take a look at how this could be used for a series. + +```python {linenos=table} +my_series = pd.Series([1,2,np.nan,4,np.nan]) +my_series.isna() +``` + +**Console Output** + +```console +0 False +1 False +2 True +3 False +4 True +``` + +When you use `isna()` on a series, you get a series in return except each value is either `True` or `False` depending on whether the value in the series was missing or not. + +You will get a similar outcome with a dataframe when locating missing values. `isna()` returns a dataaframe filled with `True` or `False` depending on whether a value was missing. Now that we have located the missing data, we need to handle it. 
+ +Depending on what data is missing or why, you can either replace it, remove rows or columns, or further uncover the potential impact of the missing data through interpolation. + +## Removing Rows or Columns with Missing Data + +This is possible the simplest option to start with. To remove a column or row that contains missing data, pandas comes with the `dropna()` function. + +Given the following dataframe, called `etsy_sellers`. + +```console + Seller Sales Total_Rating Current_Items +0 Orchid Jewels 17,896 4.5 22 +1 Ducky Ducks 5,478 3.8 10 +2 Candy Yarns 89,974 4.8 18 +3 Parks Pins 6,897 4.9 87 +4 Sierra's Stationary 112,988 4.3 347 +5 Star Stitchery 53,483 4.2 52 +``` + +```console + Seller Sales Total_Rating Current_Items +0 Orchid Jewels 17,896 4.5 22 +1 Ducky Ducks 5,478 NaN 10 +2 Candy Yarns 89,974 4.8 18 +3 Parks Pins NaN 4.9 NaN +4 Sierra's Stationary 112,988 4.3 347 +5 Star Stitchery 53,483 4.2 52 +6 NaN NaN NaN NaN +``` + +This dataframe has several missing data points. Let's first examine row 6, which is entirely blank. Assuming this dataset came directly from Etsy, that may indicate a shop in their records that no longer exists. If we are studying currently active Etsy sellers, then we don't need this data so we can drop the whole row. `dropna()` removes all rows that have a missing value, so just runnning `dropna()` would remove rows 1 and 3 in addition to row 6. pandas functions come with so many different options and with every pandas function, we encourage you to always double check the documentation before continuing. + +```python +etsy_sellers.dropna(how="all") +``` + +The above code would drop just row 6 because it is the only row with all null values. `dropna()` defaults to dropping rows, but by changing one parameter we could specify that it should drop any column that contains missing values. 
+ +```python +etsy_sellers.dropna(axis="columns") +``` + +{{% notice blue Note %}} + +For more options with `dropna()`, check out the [pandas documentation](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.dropna.html) + +{{% /notice %}} + +## Replacing Missing Values + +```console + Seller Sales Total_Rating Current_Items +0 Orchid Jewels 17,896 4.5 22 +1 Ducky Ducks 5,478 NaN 10 +2 Candy Yarns 89,974 4.8 18 +3 Parks Pins NaN 4.9 NaN +4 Sierra's Stationary 112,988 4.3 347 +5 Star Stitchery 53,483 4.2 52 +``` + +Now that we removed row 6, we might not want to drop any more columns and/or rows. We can then look at replacing the missing values. Whether or not this is a wise decision, depends entirely on the situation at hand. Items can be missing for any number of reasons, so before replacing a missing value, you should look into why that item is missing. In the case of `etsy_sellers`, we dove in and discovered that if a shop is currently on a break, then the system returns `NaN` for the number of current items. Parks Pins is currently on a break so none of the items on their shop are actually available for sale. In the case of our analysis, we then decide to replace all the missing values with 0. In our hypothetical situation, when a shop sells out of their items, their shop is put on a break until they add new items so 0 makes logical sense to replace our missing values with. + +pandas comes with a function called `fillna()` that will help us do this. If we run the following code, we would have a problem though. + +```python +etsy_sellers.fillna(0) +``` + +This code would acutally replace every single missing value in the dataframe with 0. But we decided to be a little more intentional and want to just replace the missing values in the `Current_Items` column. + +```python +values = {"Current_Items": 0} +etsy_sellers.fillna(value=values) +``` + +We can specify using a dictionary what column we want to fill and with what we want to fill it. 
This gives us so much more flexibility! + +Because pandas can account for missing values, we can also interpolate what trends are in the data beyond the missing values. \ No newline at end of file diff --git a/content/cleaning-pandas/reading/unnecessary-data/_index.md b/content/cleaning-pandas/reading/unnecessary-data/_index.md new file mode 100644 index 00000000..e767636c --- /dev/null +++ b/content/cleaning-pandas/reading/unnecessary-data/_index.md @@ -0,0 +1,25 @@ ++++ +title = "Handling Unnecessary Data" +draft = false +weight = 4 ++++ + +Unnecessary data is data that is not vital to your analysis. + +```console + Seller_Id Seller Owner_Name Sales Total_Rating Current_Items +0 8967 Orchid Jewels Orchid Smith 17,896 4.5 22 +1 908764 Ducky Ducks Nala Blake 5,478 3.8 10 +2 7463529 Candy Yarns Candy Elsbeth 89,974 4.8 18 +3 161729 Parks Pins Jade Slate 6,897 4.9 87 +4 4217 Sierra's Stationary Sierra Tomlin 112,988 4.3 347 +5 21378 Star Stitchery Sara George 53,483 4.2 52 +``` + +The `Seller_Id` column is a unique identifier given to each seller by Etsy's system and now we have the `Owner_Name` as well. While we want to understand these sellers' sales number, we don't really need their name. + +```python +etsy_sellers.drop(columns=['Owner_Name']) +``` + +`drop()` defaults to dropping rows so if we want to drop a column we have to pass the column name to `columns`. \ No newline at end of file diff --git a/content/cleaning-pandas/studio/_index.md b/content/cleaning-pandas/studio/_index.md new file mode 100644 index 00000000..567c97b7 --- /dev/null +++ b/content/cleaning-pandas/studio/_index.md @@ -0,0 +1,26 @@ ++++ +title = "Studio: Cleaning Data with Pandas" +date = 2021-10-01T09:28:27-05:00 +draft = false +weight = 3 ++++ + +## Getting Started + +For this weeks studio you will be working with a partner. + +Each of you should fork this [GitHub repository](https://github.com/launchcodeeducation/DataCleaning-Pumpkin-Sales) and clone to your computer. 
+ +## In Your Notebook + +The notebook poses questions for partners to discuss and answer about data cleaning. + +Code has been provided so that discussion can be the primary focus of the studio. + +Spaces for notes and answers to questions provided in the notebook. + +## Submitting Your Work + +When finished make sure to push your changes up to GitHub. + +Copy the link to your GitHub repository and paste it into the submission box in Canvas for **Studio: Cleaning Data** and click *Submit*. From f1726565ecaa0104a52b7942290af2e1e160e7ef Mon Sep 17 00:00:00 2001 From: Sally Steuterman Date: Tue, 5 Mar 2024 18:27:09 -0600 Subject: [PATCH 2/4] Final draft of chapter on cleaning data with pandas --- content/cleaning-pandas/_index.md | 9 +- content/cleaning-pandas/exercises/_index.md | 6 +- content/cleaning-pandas/next-steps.md | 7 +- .../reading/inconsistent-data/_index.md | 60 +++++++++++-- .../reading/introduction/_index.md | 6 +- .../reading/irregular-data/_index.md | 32 ++++--- .../reading/missing-data/_index.md | 88 ++++++++++++------- .../reading/unnecessary-data/_index.md | 36 +++++++- content/cleaning-pandas/studio/_index.md | 3 +- 9 files changed, 181 insertions(+), 66 deletions(-) diff --git a/content/cleaning-pandas/_index.md b/content/cleaning-pandas/_index.md index 9ef8add5..30a24f7e 100644 --- a/content/cleaning-pandas/_index.md +++ b/content/cleaning-pandas/_index.md @@ -11,11 +11,16 @@ weight = 15 Upon completing all the content in this chapter, you should be able to do the following: -1. Use Pandas to locate different types of dirty data. -1. Use Pandas to resolve different types of dirty data. +1. Use Pandas to locate and resolve issues related to all four types of dirty data: missing data, irregular data, unnecessary data, and inconsistent data. ## Key Terminology +These are the key terms for this chapter broken down by the page the term first appears on. Make note of these terms and their definitions as you read. + +### Handling Missing Data + +1. 
interpolation + ## Content Links {{% children %}} diff --git a/content/cleaning-pandas/exercises/_index.md b/content/cleaning-pandas/exercises/_index.md index 0805e4c7..b310b27f 100644 --- a/content/cleaning-pandas/exercises/_index.md +++ b/content/cleaning-pandas/exercises/_index.md @@ -7,12 +7,12 @@ weight = 2 ## Getting Started -Fork this [GitHub repository](https://github.com/launchcodeeducation/cleaning-data/blob/main/Cleaning%20Data%20Exercises.ipynb) and clone to your computer. +Open up `data-analysis-projects/cleaning-data-with-pandas/exercises/PandasCleaningTechniques.ipynb`. ## Code Along -1. Select one of the following data sets to work with. [Women's E-commerce Clothing Reviews](https://www.kaggle.com/nicapotato/womens-ecommerce-clothing-reviews). -1. Download the dataset and add it to your Jupyter Notebook. +1. Download [Women's E-commerce Clothing Reviews Dataset](https://www.kaggle.com/nicapotato/womens-ecommerce-clothing-reviews). +1. Add it to your Jupyter Notebook. 1. Work through the exercise notebook using the dataset. ## Submitting Your Work diff --git a/content/cleaning-pandas/next-steps.md b/content/cleaning-pandas/next-steps.md index 84511e46..299f1149 100644 --- a/content/cleaning-pandas/next-steps.md +++ b/content/cleaning-pandas/next-steps.md @@ -5,4 +5,9 @@ draft = false weight = 4 +++ -## Next Steps +Now that we have cleaned our data, you are ready to dive into data manipulation with pandas. If you want to review cleaning data with Pandas before continuing onward, here are some of our favorite resources: + +1. [Working with Missing Data](https://pandas.pydata.org/docs/user_guide/missing_data.html) +1. [Detect and Remove the Outliers using Python](https://www.geeksforgeeks.org/detect-and-remove-the-outliers-using-python/) +1. [Pandas - Fixing Wrong Data](https://www.w3schools.com/python/pandas/pandas_cleaning_wrong_data.asp) +1. 
[Pandas - Cleaning Data with Wrong Format](https://www.w3schools.com/python/pandas/pandas_cleaning_wrong_format.asp) diff --git a/content/cleaning-pandas/reading/inconsistent-data/_index.md b/content/cleaning-pandas/reading/inconsistent-data/_index.md index 022a560f..1ed152c0 100644 --- a/content/cleaning-pandas/reading/inconsistent-data/_index.md +++ b/content/cleaning-pandas/reading/inconsistent-data/_index.md @@ -4,7 +4,7 @@ draft = false weight = 5 +++ -Inconsistent data is data that is not properly formatted for the analysis. +Our final type of dirty data we want to clean is inconsistent data. Inconsistent data is data that is not properly formatted for the analysis. This could be data that is strings but should be numbers (`'0'` instead of `0` or `'one hundred'` instead of `100`). Let's study `etsy_sellers` for the final time to see how we can detect and handle inconsistent data. ```console Seller_Id Seller Sales Total_Rating Current_Items Star_Seller @@ -16,16 +16,58 @@ Inconsistent data is data that is not properly formatted for the analysis. 5 21378 Star Stitchery 53,483 4.2 52 0 ``` -If we take a look at `Star_Seller`, we can see that all the star sellers are labeled with `True` and those who aren't star sellers have `0`. +Star sellers meet Etsy's highest standard of customer service so it makes sense that we would have a column in our dataset to designate whether or not someone is a star seller. If we take a look at the new `Star_Seller` column, we can see that all the star sellers are labeled with `'True'` and those who aren't star sellers have `'0'`. Now we need to resolve this inconsistency in order for us to do an effective analysis with the `Star_Seller` column. We can either convert everything to booleans or to numbers. -We can choose either to convert everything either to booleans or numbers. +## The Numbers Era of Star Sellers -Booleans +We are going to start by converting everything to numbers. 
Once everything in the column is converted to numbers, it will be easier for us to convert the column to booleans. The sellers that are not star sellers are designated with a `'0'` so converting that string to a number is going to be a little more straightforward than converting the string `'True'` to `1`. -Change string 0 to False -Convert +1. First, let's focus in on turning `'True'` to `'1'`. -Numbers + ```python + etsy_sellers = etsy_sellers.loc[etsy_sellers['Star_Seller'] == 'True'] = '1' + ``` -Change string True to 1 -Convert \ No newline at end of file + This code will replace all the values in the `Star_Seller` column with `'1'` only if that value is currently equal to `'True'`. + +1. We can now convert the whole `Star_Seller` column to integers. + + ```python + etsy_sellers.Star_Seller = etsy_sellers.Star_Seller.astype('int64') + ``` + + `astype()` allows us to convert a dataframe or column of a dataframe to a specific type, in this case, `int64`. + +After all of this, our dataframe will look a lot more like this: + +```console + Seller_Id Seller Sales Total_Rating Current_Items Star_Seller +0 8967 Orchid Jewels 17,896 4.5 22 0 +1 908764 Ducky Ducks 5,478 3.8 10 1 +2 7463529 Candy Yarns 89,974 4.8 18 1 +3 161729 Parks Pins 6,897 4.9 87 1 +4 4217 Sierra's Stationary 112,988 4.3 347 0 +5 21378 Star Stitchery 53,483 4.2 52 0 +``` + +## The Booleans Era of Star Sellers + +With the whole `Star_Seller` column converted to integers, we just have to do one more step to convert the whole column to booleans. 
+ +```python +etsy_sellers.Star_Seller = etsy_sellers.Star_Seller.astype('bool') +``` + +With this step, `etsy_sellers` is going to become: + +```console + Seller_Id Seller Sales Total_Rating Current_Items Star_Seller +0 8967 Orchid Jewels 17,896 4.5 22 False +1 908764 Ducky Ducks 5,478 3.8 10 True +2 7463529 Candy Yarns 89,974 4.8 18 True +3 161729 Parks Pins 6,897 4.9 87 True +4 4217 Sierra's Stationary 112,988 4.3 347 False +5 21378 Star Stitchery 53,483 4.2 52 False +``` + +Whether you convert the column to booleans or stay with integers depends entirely on what you need from your analysis and what you find easier to work with later on. This is the case with a lot of cleaning data. The approach you take to cleaning data is heavily dependent on you and what you are hoping to achieve with your analysis. The key for now is to practice and nto be afraid to try something new. \ No newline at end of file diff --git a/content/cleaning-pandas/reading/introduction/_index.md b/content/cleaning-pandas/reading/introduction/_index.md index d83c0fb0..ec152415 100644 --- a/content/cleaning-pandas/reading/introduction/_index.md +++ b/content/cleaning-pandas/reading/introduction/_index.md @@ -4,9 +4,7 @@ draft = false weight = 1 +++ -We have already encountered cleaning data, so we should revisit the topic quicky. - -As we discussed in the previous chapter on cleaning data, we need to clean our data to ensure that our analysis is accurate. If we want to perform a regression analysis later on, the analysis will use all the existing data points. By cleaning our data first, we can ensure that the data points being used in the analysis are what we need. +As we discussed in the [previous chapter]({{< relref "../../../cleaning-spreadsheets" >}}) on cleaning data, we need to clean our data to ensure that our analysis is accurate. For example, if we want to project the price of a stock several months from now, then we would need to use as much data as possible for our analysis. 
If the data is not clean, then our analysis could be thrown off and depending on how unclean the data is, the predicted price could end up hundreds off. This is why we clean our data before diving into further analysuis. By cleaning our data first, we can ensure that the data points being used in the analysis are what we need. As we previously covered, there are four types of dirty data: @@ -15,4 +13,4 @@ As we previously covered, there are four types of dirty data: 1. unnecessary data 1. inconsistent data -Pandas has different ways of handling these types. Because pandas is so powerful, we can easily search for dirty data and handle it. Let's examine each dirty data type and how we can clean it in pandas. +While we learned lots of different ways to use spreadsheets to clean data, let's see how we can use pandas to clean data. Because pandas is built for data analysis, the library comes with different ways of handling all four dirty data types. Let's examine each dirty data type and how we can clean it in pandas. diff --git a/content/cleaning-pandas/reading/irregular-data/_index.md b/content/cleaning-pandas/reading/irregular-data/_index.md index 86548f7d..d3f6e000 100644 --- a/content/cleaning-pandas/reading/irregular-data/_index.md +++ b/content/cleaning-pandas/reading/irregular-data/_index.md @@ -4,9 +4,9 @@ draft = false weight = 3 +++ -Irregular data is a term that commonly applies to outliers. +Irregular data refers to outliers. Outliers are data points that are abnormal. An abnormal data point might be a stock price dropping by over 10x in a day or a heart rate increasing 3x from the resting heart rate while out on a run. As you approach different outliers, you should recognize that abnormalities do happen in real life so while something seems out of the realm of possibility, we should carefully consider what happened before dismissing it and removing it from the dataset. 
In the case of the heart rate example, if the patient had a resting heart rate of 100 beats per minute, a rise to 300 beats per minute, even after exercise, could cause disastrous health effects.
+However, when we use the `descibe()` function and look more closely at `Total_Rating`, we might notice that the max is 6.7 which is an outlier. The highest number of stars a shop can have on Etsy is 5 so something is up here and we need to investigate. We can then drop the row for Sierra's Stationary by using the `drop()` function. -## Visualizing Outliers - -The two most common visualization types for locating outliers are histograms and scatterplots. +```python -```python -etsy_sellers.plot.scatter(x="Seller",y="Total_Rating") +outlier = np.where((etsy_sellers['Total_Rating'] < 0.0) & (etsy_sellers['Total_Rating'] > 5.0)) +etsy_sellers.drop(etsy_sellers[outlier]) ``` -In this case, while we actually would find it more helpful to do a histogram. +Even though we can visually see where Sierra's Stationary is in the dataframe, if we have one row that is off, we might have others. `np.where()` returns a list of all indices where the condition is met. In this case, the condition is that the rating must be greater than or equal to 0 and less than or equal to 5. + +We can also use visualizations to detect outliers. + +## Visualizing Outliers + +The two most common visualization types for locating outliers are histograms and scatterplots. Which one you choose depends on what portion of your dataset you want to visualize. In the case of visualizing `Total_Rating`, a histogram might be the better option. ```python etsy_sellers.plot.hist(column="Total_Rating") ``` -While one plot visualization helps highlight outliers better than others, it entirely depends on what you are working on. \ No newline at end of file +We could also use a scatterplot if we wanted to try it out. + +```python +etsy_sellers.plot.scatter(x="Seller",y="Total_Rating") +``` + +pandas comes with a number of different visualizations, so feel free to explore the different styles when on a mission to detect outliers. 
diff --git a/content/cleaning-pandas/reading/missing-data/_index.md b/content/cleaning-pandas/reading/missing-data/_index.md index 93d0d674..78432b2d 100644 --- a/content/cleaning-pandas/reading/missing-data/_index.md +++ b/content/cleaning-pandas/reading/missing-data/_index.md @@ -4,9 +4,7 @@ draft = false weight = 2 +++ -What is missing data? - -Missing data is when a value for either a row or column is not filled in. pandas has different data types to describe missing data. `None` and `NaN` both represent missing values, however, the two are not actually equivalent and the boolean expression `None == nan` evaluates to `False`. This is because `None` is a Python object and `NaN` is a floating point value. While pandas has many built-in ways to handle missing data that treat these two data types as interchangeable, when working on your analysis, you may have to code a custom solution. +Missing data is when a value for either a row or column is not actually there. pandas has different data types for missing data so when you print out a row of a dataframe where data is missing you will see one of these data types. pandas has a number of built-in methods that can handle missing data. `None` and `NaN` both hold missing values, however, the two are not actually equivalent. The boolean expression `None == nan` evaluates to `False`. This is because `None` is a Python object and `NaN` is a floating point value. If you find yourself needing to code a custom solution to handle an issue related to missing data, you might need to keep this in mind! {{% notice blue Note %}} @@ -14,7 +12,7 @@ pandas has even more types to represent a missing value, such as a data type to {{% /notice %}} -pandas is so intelligent that it can account for missing values when doing summary statistics, so we cannot use summary statistics to start to detect our missing values. We need to use built-in functionality to locate these values and handle them. 
pandas comes with a built-in function called `isna()` to help us here. +pandas can account for missing values when doing summary statistics, so we cannot count on summary statistics to detect our missing values. We need to use built-in functionality to locate these values and handle them. pandas comes with a built-in function called `isna()` to help us here. {{% notice blue Note %}} @@ -39,27 +37,15 @@ my_series.isna() 4 True ``` -When you use `isna()` on a series, you get a series in return except each value is either `True` or `False` depending on whether the value in the series was missing or not. - -You will get a similar outcome with a dataframe when locating missing values. `isna()` returns a dataaframe filled with `True` or `False` depending on whether a value was missing. Now that we have located the missing data, we need to handle it. +When you use `isna()` on a series, you get a series in return. Each value in the returned series is either `True` or `False` depending on whether the value in the series was missing or not. -Depending on what data is missing or why, you can either replace it, remove rows or columns, or further uncover the potential impact of the missing data through interpolation. +You will get a similar outcome with a dataframe when locating missing values. `isna()` returns a dataaframe filled with `True` or `False` depending on whether a value was missing. Now that we have located the missing data, we need to handle it. Depending on what data is missing and why, you can either replace it, remove rows or columns, or further uncover the potential impact of the missing data through interpolation. ## Removing Rows or Columns with Missing Data -This is possible the simplest option to start with. To remove a column or row that contains missing data, pandas comes with the `dropna()` function. +This is possibly the simplest option to start with. To remove a column or row that contains missing data, pandas comes with the `dropna()` function. 
-Given the following dataframe, called `etsy_sellers`. - -```console - Seller Sales Total_Rating Current_Items -0 Orchid Jewels 17,896 4.5 22 -1 Ducky Ducks 5,478 3.8 10 -2 Candy Yarns 89,974 4.8 18 -3 Parks Pins 6,897 4.9 87 -4 Sierra's Stationary 112,988 4.3 347 -5 Star Stitchery 53,483 4.2 52 -``` +Throughout this chapter, we will use the variations on the following dataframe, called `etsy_sellers`, to ecxamine how we can use pandas to clean data. ```console Seller Sales Total_Rating Current_Items @@ -72,24 +58,18 @@ Given the following dataframe, called `etsy_sellers`. 6 NaN NaN NaN NaN ``` -This dataframe has several missing data points. Let's first examine row 6, which is entirely blank. Assuming this dataset came directly from Etsy, that may indicate a shop in their records that no longer exists. If we are studying currently active Etsy sellers, then we don't need this data so we can drop the whole row. `dropna()` removes all rows that have a missing value, so just runnning `dropna()` would remove rows 1 and 3 in addition to row 6. pandas functions come with so many different options and with every pandas function, we encourage you to always double check the documentation before continuing. +This dataframe has several missing data points. Let's first examine row 6, which is entirely blank. Assuming this dataset came directly from Etsy, that may indicate a shop in their records that no longer exists. If we are studying currently active Etsy sellers for our analysis, then we don't need this data so we can drop the whole row. `dropna()` removes all rows that have a missing value, so just running `dropna()` would remove rows 1 and 3 in addition to row 6. pandas functions come with so many different options and with every pandas function, we encourage you to always double check the documentation to see the full scope of those options. 
The [documentation](https://pandas.pydata.org/docs/dev/reference/api/pandas.DataFrame.dropna.html) specifies how we can drop a row where all the data is missing. ```python etsy_sellers.dropna(how="all") ``` -The above code would drop just row 6 because it is the only row with all null values. `dropna()` defaults to dropping rows, but by changing one parameter we could specify that it should drop any column that contains missing values. +The above code would drop just row 6 because it is the only row with all null values. `dropna()` defaults to dropping rows, but by changing one parameter we could specify that it should drop any column that contains all missing values. ```python -etsy_sellers.dropna(axis="columns") +etsy_sellers.dropna(axis="columns", how="all") ``` -{{% notice blue Note %}} - -For more options with `dropna()`, check out the [pandas documentation](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.dropna.html) - -{{% /notice %}} - ## Replacing Missing Values ```console @@ -113,10 +93,54 @@ etsy_sellers.fillna(0) ``` -This code would acutally replace every single missing value in the dataframe with 0. But we decided to be a little more intentional and want to just replace the missing values in the `Current_Items` column. ```python -values = {"Current_Items": 0} -etsy_sellers.fillna(value=values) +cols = {"Current_Items": 0} +etsy_sellers.fillna(value=cols) ``` We can specify using a dictionary what column we want to fill and with what we want to fill it. This gives us so much more flexibility! -Because pandas can account for missing values, we can also interpolate what trends are in the data beyond the missing values. \ No newline at end of file +## Interpolating Missing Values + +Because pandas can account for missing values, we can also interpolate what the missing values might be. **Interpolation** means inserting values into a dataset based on existing trends in the data. 
The `interpolate()` function includes a parameter that can specify how you want pandas to interpolate the data. The `method` parameter defaults to a linear interpolation meaning that pandas will fill in the missing values with the assumption that everything is equally spaced like a line. + +```console + Seller Sales Total_Rating Current_Items +0 Orchid Jewels 17,896 4.5 22 +1 Ducky Ducks 5,478 NaN 10 +2 Candy Yarns 89,974 4.8 18 +3 Parks Pins NaN 4.9 0 +4 Sierra's Stationary 112,988 4.3 347 +5 Star Stitchery 53,483 4.2 52 +``` + +The last remaining values are in the `Total_Rating` colunmn and the `Sales` column. Linear interpolation makes sense in neither case. We might want to interpolate what the missing rating is for Ducky Ducks based on what other values in the column are so in that case we can use the pad method. + +```python +etsy_sellers.interpolate(method="pad") +``` + +Interpolation can be a bit of a gamble if you don't understand the underlying trends of the dataset, so you may not see it very often. + +## Check Your Understanding + +{{% notice green Question %}} + +True or False: pandas can account for missing values when performing certain calculations such as summary statistics + +{{% /notice %}} + + + +{{% notice green Question %}} + +Which pandas function detects missing values? Select all that apply. + +1. `dropna()` +1. `isna()` +1. `interpolate()` +1. `fillna()` +1. `isnull()` + +{{% /notice %}} + + \ No newline at end of file diff --git a/content/cleaning-pandas/reading/unnecessary-data/_index.md b/content/cleaning-pandas/reading/unnecessary-data/_index.md index e767636c..a0cbd44a 100644 --- a/content/cleaning-pandas/reading/unnecessary-data/_index.md +++ b/content/cleaning-pandas/reading/unnecessary-data/_index.md @@ -4,7 +4,15 @@ draft = false weight = 4 +++ -Unnecessary data is data that is not vital to your analysis. +Unnecessary data is data that is not vital to your analysis. 
For example, if you are doing an analysis on tax data, information like a taxpayer's name or street address might not be necessary for your analysis. The taxpayer's name may be helpful in distinguishing one taxpayer from the other, but do you need to distinguish the taxpayer data on an individual basis? The answer is likely not. This means we have some unnecessary data in the dataset that we need to clean. + +{{% notice blue Note %}} + +You may also clean unnecessary data for cybersecurity reasons. Taxpayers' personally identifiable information, or PII, is not something that you want to keep in your dataset in case your company gets hacked. + +{{% /notice %}} + +We have made some updates to `etsy_sellers` so that we can see how to use pandas to remove unnecessary data. ```console Seller_Id Seller Owner_Name Sales Total_Rating Current_Items @@ -16,10 +24,32 @@ Unnecessary data is data that is not vital to your analysis. 5 21378 Star Stitchery Sara George 53,483 4.2 52 ``` -The `Seller_Id` column is a unique identifier given to each seller by Etsy's system and now we have the `Owner_Name` as well. While we want to understand these sellers' sales number, we don't really need their name. +The `Seller_Id` column is a unique identifier given to each seller by Etsy's system and now we have the `Owner_Name` as well. The seller's id can be used to tie this info back to a specific shop so with that there, we should be asking ourselves if we need the shop name and the owner name. While we want to understand these sellers' sales numbers, we don't really need the owner name. ```python etsy_sellers.drop(columns=['Owner_Name']) ``` -`drop()` defaults to dropping rows so if we want to drop a column we have to pass the column name to `columns`. \ No newline at end of file
Alternatively, we can use the axis parameter to drop this column: + +```python +etsy_sellers.drop(['Owner_Name'], axis=1) +``` + +With both of these code samples, we need to specify the column name that we want to drop, but some people might prefer one method over the other. Keep an eye out for both ways of dropping a column when reviewing others' code! + +If our analysis only focuses on sellers that do not have any items in the fiber arts space, then we might also want to remove Candy Yarns and Star Stitchery. + +```python +etsy_sellers.drop([2,5]) +``` + +Since we know the indices of Candy Yarns and Star Stitchery, we can drop by index with the above syntax. + +## Check Your Understanding + +{{% notice green Question %}} + +If you are performing an analysis on inflation and grocery prices, do you need the zip code of a grocery store? + +{{% /notice %}} \ No newline at end of file diff --git a/content/cleaning-pandas/studio/_index.md b/content/cleaning-pandas/studio/_index.md index 567c97b7..577a72b5 100644 --- a/content/cleaning-pandas/studio/_index.md +++ b/content/cleaning-pandas/studio/_index.md @@ -9,7 +9,8 @@ weight = 3 For this weeks studio you will be working with a partner. -Each of you should fork this [GitHub repository](https://github.com/launchcodeeducation/DataCleaning-Pumpkin-Sales) and clone to your computer. +Each of you should work in your notebook that can be found at `data-analysis-projects/cleaning-data-with-pandas/studio`. +You will also notice that we have already added a CSV for you. This CSV is a subset of a larger dataset on [Kaggle](https://www.kaggle.com/datasets/usda/a-year-of-pumpkin-prices) from the USDA. You only need this one CSV to do the studio. 
## In Your Notebook From 864825d7d640502cecd0a21f86e5de871a67750c Mon Sep 17 00:00:00 2001 From: John Woolbright Date: Mon, 11 Mar 2024 10:29:04 -0500 Subject: [PATCH 3/4] copy/edit for intro and missing-data --- content/cleaning-pandas/reading/introduction/_index.md | 2 +- content/cleaning-pandas/reading/missing-data/_index.md | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/content/cleaning-pandas/reading/introduction/_index.md b/content/cleaning-pandas/reading/introduction/_index.md index ec152415..92e04abc 100644 --- a/content/cleaning-pandas/reading/introduction/_index.md +++ b/content/cleaning-pandas/reading/introduction/_index.md @@ -4,7 +4,7 @@ draft = false weight = 1 +++ -As we discussed in the [previous chapter]({{< relref "../../../cleaning-spreadsheets" >}}) on cleaning data, we need to clean our data to ensure that our analysis is accurate. For example, if we want to project the price of a stock several months from now, then we would need to use as much data as possible for our analysis. If the data is not clean, then our analysis could be thrown off and depending on how unclean the data is, the predicted price could end up hundreds off. This is why we clean our data before diving into further analysuis. By cleaning our data first, we can ensure that the data points being used in the analysis are what we need. +As we discussed in the [previous chapter]({{% relref "../../../cleaning-spreadsheets" %}}) on cleaning data, we need to clean our data to ensure that our analysis is accurate. For example, if we want to project the price of a stock several months from now, then we would need to use as much data as possible for our analysis. If the data is not clean, then our analysis could be thrown off and depending on how unclean the data is, the predicted price could end up hundreds off. This is why we clean our data before diving into further analysis. 
By cleaning our data first, we can ensure that the data points being used in the analysis are what we need. As we previously covered, there are four types of dirty data: diff --git a/content/cleaning-pandas/reading/missing-data/_index.md b/content/cleaning-pandas/reading/missing-data/_index.md index 78432b2d..f061d72c 100644 --- a/content/cleaning-pandas/reading/missing-data/_index.md +++ b/content/cleaning-pandas/reading/missing-data/_index.md @@ -39,13 +39,13 @@ my_series.isna() When you use `isna()` on a series, you get a series in return. Each value in the returned series is either `True` or `False` depending on whether the value in the series was missing or not. -You will get a similar outcome with a dataframe when locating missing values. `isna()` returns a dataaframe filled with `True` or `False` depending on whether a value was missing. Now that we have located the missing data, we need to handle it. Depending on what data is missing and why, you can either replace it, remove rows or columns, or further uncover the potential impact of the missing data through interpolation. +You will get a similar outcome with a dataframe when locating missing values. `isna()` returns a dataframe filled with `True` or `False` depending on whether a value was missing. Now that we have located the missing data, we need to handle it. Depending on what data is missing and why, you can either replace it, remove rows or columns, or further uncover the potential impact of the missing data through interpolation. ## Removing Rows or Columns with Missing Data This is possibly the simplest option to start with. To remove a column or row that contains missing data, pandas comes with the `dropna()` function. -Throughout this chapter, we will use the variations on the following dataframe, called `etsy_sellers`, to ecxamine how we can use pandas to clean data. 
+Throughout this chapter, we will use the variations on the following dataframe, called `etsy_sellers`, to examine how we can use pandas to clean data. ```console Seller Sales Total_Rating Current_Items @@ -90,7 +90,7 @@ pandas comes with a function called `fillna()` that will help us do this. If we etsy_sellers.fillna(0) ``` -This code would acutally replace every single missing value in the dataframe with 0. But we decided to be a little more intentional and want to just replace the missing values in the `Current_Items` column. +This code would actually replace every single missing value in the dataframe with 0. But we decided to be a little more intentional and want to just replace the missing values in the `Current_Items` column. ```python cols = {"Current_Items": 0} @@ -113,7 +113,7 @@ Because pandas can account for missing values, we can also interpolate what the 5 Star Stitchery 53,483 4.2 52 ``` -The last remaining values are in the `Total_Rating` colunmn and the `Sales` column. Linear interpolation makes sense in neither case. We might want to interpolate what the missing rating is for Ducky Ducks based on what other values in the column are so in that case we can use the pad method. +The last remaining values are in the `Total_Rating` column and the `Sales` column. Linear interpolation makes sense in neither case. We might want to interpolate what the missing rating is for Ducky Ducks based on what other values in the column are so in that case we can use the pad method. 
```python etsy_sellers.interpolate(method="pad") From 09061f4e49789521537113859ef4dcd07869d6b5 Mon Sep 17 00:00:00 2001 From: John Woolbright Date: Mon, 11 Mar 2024 11:56:53 -0500 Subject: [PATCH 4/4] last copy/edit of review --- content/cleaning-pandas/reading/inconsistent-data/_index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/content/cleaning-pandas/reading/inconsistent-data/_index.md b/content/cleaning-pandas/reading/inconsistent-data/_index.md index 1ed152c0..58449c5f 100644 --- a/content/cleaning-pandas/reading/inconsistent-data/_index.md +++ b/content/cleaning-pandas/reading/inconsistent-data/_index.md @@ -70,4 +70,4 @@ With this step, `etsy_sellers` is going to become: 5 21378 Star Stitchery 53,483 4.2 52 False ``` -Whether you convert the column to booleans or stay with integers depends entirely on what you need from your analysis and what you find easier to work with later on. This is the case with a lot of cleaning data. The approach you take to cleaning data is heavily dependent on you and what you are hoping to achieve with your analysis. The key for now is to practice and nto be afraid to try something new. \ No newline at end of file +Whether you convert the column to booleans or stay with integers depends entirely on what you need from your analysis and what you find easier to work with later on. This is the case with a lot of cleaning data. The approach you take to cleaning data is heavily dependent on you and what you are hoping to achieve with your analysis. The key for now is to practice and not be afraid to try something new. \ No newline at end of file