Querying a DataFrame
Practice session with Pandas dataframe queries
This page keeps the original notebook-style practice content while using the current Repath site header, page header, and footer.
In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
In [ ]:
df = pd.read_csv('/content/clean_agri_ds.csv')
In [5]:
df.head(2)
Out[5]:
In [7]:
df.describe()
Out[7]:
In [8]:
df.drop(['Len State', 'Len Dist', 'Len Season', 'Len Crop'], inplace = True, axis=1)
In [9]:
df.head(2)
Out[9]:
In [10]:
df.set_index(['Unnamed: 0'])
Out[10]:
In [11]:
df.rename(columns = {'Unnamed: 0':"ColIndex"}, inplace = True)
In [12]:
df.set_index(['ColIndex'])
Out[12]:
In [13]:
df['Crop_Year'] = df['Crop_Year'].astype(str)
In [14]:
df['ColIndex'] = df['ColIndex'].astype(str)
In [15]:
df.describe()
Out[15]:
Conditionally retrieving data from a dataframe with QUERY¶
DataFrame.query(expr, inplace=False, **kwargs)
In [16]:
df.loc[df.State_Name == "Andhra Pradesh", 'District_Name'].unique()
Out[16]:
In [17]:
df.loc[df['State_Name'] == 'Andhra Pradesh']['District_Name'].unique()
Out[17]:
In [18]:
df.query('State_Name == "Andhra Pradesh"')['District_Name'].unique()
Out[18]:
In [19]:
df.query('State_Name == "Andhra Pradesh" and District_Name == "VIZIANAGARAM"')
Out[19]:
In [20]:
df.query('State_Name == "Andhra Pradesh" and District_Name == "VIZIANAGARAM" and Season == "Rabi"')
Out[20]:
In [21]:
df.query('State_Name == "Andhra Pradesh" and District_Name == "VIZIANAGARAM" and Season == "Rabi" and Crop_Year == "2011"' )
Out[21]:
In [22]:
df.query('State_Name == "Andhra Pradesh" and District_Name == "VIZIANAGARAM" and Crop_Year == "2011" and Season == "Rabi" and Crop == "Onion"')
Out[22]:
In [23]:
df_dist = df.query('State_Name == "Andhra Pradesh" and District_Name == "VIZIANAGARAM"')
In [24]:
df_dist.head(1)
Out[24]:
In [25]:
def getDistNames_forState(strStateName):
print(df.query('State_Name == "' + strStateName + '"')['Crop'].unique())
In [26]:
getDistNames_forState('Andhra Pradesh')
In [27]:
display(df['State_Name'].unique())
In [28]:
def getDistrict_Names_for_State(strStateName):
print(df.loc[df['State_Name'] == strStateName, "District_Name"].iloc[:].unique())
In [29]:
getDistrict_Names_for_State("Arunachal Pradesh")
In [30]:
getDistrict_Names_for_State("Andhra Pradesh")
examples¶
- df.loc[(df["B"] > 50) & (df["C"] == 900), "A"].values
- df.loc[(df["B"] > 50) & (df["C"] == 900), "A"]
In [31]:
def getCropNames_for_State_District(strStateName, strDistName):
print(df.loc[(df['State_Name'] == strStateName) & (df['District_Name'] == strDistName), "Crop"].unique())
In [32]:
getCropNames_for_State_District("Arunachal Pradesh", "WEST KAMENG")
In [35]:
getCropNames_for_State_District("Assam","KARBI ANGLONG" )
In [36]:
df.loc[df['State_Name'] == "Andhra Pradesh", "District_Name"].iloc[:].unique()
Out[36]:
In [37]:
getDistrict_Names_for_State("Arunachal Pradesh")
In [38]:
getDistrict_Names_for_State("Bihar")
In [39]:
print(getCropNames_for_State_District("Bihar", "MADHEPURA"))
In [64]:
ArPr_Crops = list(getCropNames_for_State_District("Bihar", "MADHEPURA"))
In [ ]:
ArPr_Crops
In [ ]:
lstArPrCrops = []
for value in ArPr_Crops:
lstArPrCrops.append(value)
In [62]:
getCropNames_for_State_District("Arunachal Pradesh", 'DIBANG VALLEY')
In [63]:
df.loc[df['State_Name'] == "Andhra Pradesh", "District_Name"].values[:]
Out[63]:
Conditionally retrieving data from a dataframe with FILTER¶
DataFrame.filter(items=None, like=None, regex=None, axis=None)
In [41]:
df.filter(["State_Name", "Production", "Area"])
Out[41]:
In [42]:
df.filter(["Crop_Year", "State_Name","Area"]).groupby('Crop_Year').sum()
Out[42]:
Use filter() to retrieve all columns in a dataframe, which has, for example, the letter ‘a’ or ‘A’ in its name.¶
In [43]:
df.filter(regex = '[Aa]')
Out[43]:
Names dataset - Open DataSource¶
In [44]:
url = "https://data.chhs.ca.gov/dataset/4a8cb74f-c4fa-458a-8ab1-5f2c0b2e22e3/resource/f3fe42ed-4441-4fd8-bf53-92fb80a246da/download/2021-06-18_topbabynames_1960-2019.csv"
ndf = pd.read_csv(url)
In [45]:
ndf.head()
Out[45]:
In [46]:
ndf.describe()
Out[46]:
In [47]:
ndf[ndf['Name'].str.startswith('AL')]
Out[47]:
In [48]:
ndf[ndf["Name"].str.endswith('RA')]
Out[48]:
In [49]:
ndf[ndf['Name'].str.contains("AS")]
Out[49]:
In [50]:
ndf[ndf['Name'].str.contains("AS", case = False)]
Out[50]:
In [61]:
ndf[ndf['Name'].str.contains("IAN$|INE$", case = False, regex = True)].head()
Out[61]:
In [52]:
ndf[ndf['Name'].str.contains("IAN$|INE$", case = False, regex = False)]
Out[52]:
In [60]:
ndf[((ndf['Name'] == 'SPPHIA') | (ndf["Name"] == "PAUL"))].head()
Out[60]:
In [59]:
ndf[ndf["Name"].isin(["SOPHIA", "PAUL"])].head(4)
Out[59]:
In [55]:
ndf.filter(items=['Year', 'Name'], axis=1)
Out[55]:
In [56]:
ndf.filter(like='17', axis = 0)
Out[56]:
In [57]:
ndf.filter(regex = '^5|8|3', axis=0)
Out[57]:
In [58]:
ndf.isnull().count()
Out[58]:
In [58]: