# Pandas is a powerful Python library for data manipulation and analysis.
# It provides data structures and functions needed to efficiently work with
# structured data. Pandas is built on top of NumPy and is an essential tool
# in any data scientist's toolkit.
# Install pandas if you haven't already
# !pip install pandas
# Import pandas
import pandas as pd
import numpy as np # Often used alongside pandas
# Creating a Series (1D array with labels)
# Each value is paired with the index label at the same position.
s = pd.Series([1, 3, 5, 7, 9],
              index=['a', 'b', 'c', 'd', 'e'])
print("Series example:")
print(s)
# The trailing "\n" emits a blank line after this section's output.
print(f"Series type: {type(s)}\n")
# Creating a DataFrame from a dictionary
# Each key becomes a column label and each list supplies that column's values;
# all lists have the same length (one entry per row).
data = {
    'Name': ['John', 'Anna', 'Peter', 'Linda'],
    'Age': [28, 34, 29, 42],
    'City': ['New York', 'Paris', 'Berlin', 'London'],
    'Salary': [90000, 85000, 72000, 95000],
}
df = pd.DataFrame(data)
print("DataFrame created from dictionary:")
print(df)
# The trailing "\n" emits a blank line after this section's output.
print(f"DataFrame type: {type(df)}\n")
# Creating a DataFrame from a list of lists
# Row-oriented input: every inner list is one record, and the column labels
# are supplied separately in the same order as the values inside each record.
columns = ['Name', 'Age', 'City', 'Salary']
data_list = [
    ['John', 28, 'New York', 90000],
    ['Anna', 34, 'Paris', 85000],
    ['Peter', 29, 'Berlin', 72000],
    ['Linda', 42, 'London', 95000],
]
df2 = pd.DataFrame(data=data_list, columns=columns)
# Header and frame are printed on separate lines via the newline separator.
print("DataFrame created from list of lists:", df2, sep="\n")
# Creating a DataFrame from a NumPy array
# The 2D array supplies the cell values; row and column labels are given
# explicitly instead of relying on the default integer labels.
array_data = np.random.rand(4, 4)  # 4x4 array of random values
df3 = pd.DataFrame(array_data,
                   columns=['A', 'B', 'C', 'D'],
                   index=['Row1', 'Row2', 'Row3', 'Row4'])
# The leading "\n" emits a blank line before this section's header.
print("\nDataFrame created from NumPy array:")
print(df3)
# Creating a DataFrame with different data types
# Each column holds a different kind of value, so the frame ends up with
# one dtype per column; df4.dtypes lists them.
df4 = pd.DataFrame({
    'A': [1, 2, 3, 4],                          # integers
    'B': [1.1, 2.2, 3.3, 4.4],                  # floats
    'C': ['a', 'b', 'c', 'd'],                  # strings
    'D': [True, False, True, False],            # booleans
    'E': pd.date_range('20230101', periods=4),  # dates
})
# The leading "\n" emits a blank line before this section's header.
print("\nDataFrame with different data types:")
print(df4)
print(df4.dtypes)
# Reading data from CSV
# df_csv = pd.read_csv('data.csv')
# print(df_csv.head())
# Let's create a sample CSV first
# index=False keeps the row index out of the file, so reading it back does
# not produce an extra unnamed column.
df.to_csv('sample_data.csv', index=False)
# The leading "\n" in each message emits a blank line before it.
print("\nCSV file created.")
# Now read it back
df_from_csv = pd.read_csv('sample_data.csv')
print("\nDataFrame read from CSV:")
print(df_from_csv)
# Reading from Excel
# df_excel = pd.read_excel('data.xlsx', sheet_name='Sheet1')
# print(df_excel.head())
# Writing to Excel
# NOTE: pandas delegates .xlsx output to an Excel writer engine (e.g. openpyxl),
# which must be installed separately for this call to succeed.
df.to_excel('sample_data.xlsx', sheet_name='People', index=False)
# The leading "\n" emits a blank line before the message.
print("\nExcel file created.")
# JSON
# orient='records' writes the frame as a JSON array with one object per row.
df.to_json('sample_data.json', orient='records')
# The leading "\n" in each message emits a blank line before it.
print("\nJSON file created.")
# Read the same file back into a DataFrame
df_from_json = pd.read_json('sample_data.json')
print("\nDataFrame read from JSON:")
print(df_from_json)
# SQL (using SQLite as an example)
import sqlite3

conn = sqlite3.connect('sample_database.db')
# try/finally guarantees the connection is closed even if writing or reading
# raises. (Using the sqlite3 connection itself as a context manager would only
# commit or roll back the transaction, not close the connection.)
try:
    # if_exists='replace' drops and recreates the table so the script can be
    # re-run; index=False keeps the row index out of the table.
    df.to_sql('employees', conn, if_exists='replace', index=False)
    # The leading "\n" in each message emits a blank line before it.
    print("\nData written to SQLite database.")
    # Read from SQL
    df_from_sql = pd.read_sql('SELECT * FROM employees', conn)
    print("\nDataFrame read from SQL:")
    print(df_from_sql)
finally:
    conn.close()