import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_percentage_error
from more_itertools import powerset

# Load the two raw datasets from CSV files in the working directory.
drugs = pd.read_csv("drugs.csv")
crimes = pd.read_csv("state_crime.csv")


# Preview the first rows of the drug-use table.
drugs.head()


# Preview the first rows of the crime table.
crimes.head()


# List the dtype of every column in the drug-use table.
drugs.dtypes

State                                                 object
Year                                                   int64
Population.12-17                                       int64
Population.18-25                                       int64
Population.26+                                         int64
Totals.Alcohol.Use Disorder Past Year.12-17            int64
Totals.Alcohol.Use Disorder Past Year.18-25            int64
Totals.Alcohol.Use Disorder Past Year.26+              int64
Rates.Alcohol.Use Disorder Past Year.12-17           float64
Rates.Alcohol.Use Disorder Past Year.18-25           float64
Rates.Alcohol.Use Disorder Past Year.26+             float64
Totals.Alcohol.Use Past Month.12-17                    int64
Totals.Alcohol.Use Past Month.18-25                    int64
Totals.Alcohol.Use Past Month.26+                      int64
Rates.Alcohol.Use Past Month.12-17                   float64
Rates.Alcohol.Use Past Month.18-25                   float64
Rates.Alcohol.Use Past Month.26+                     float64
Totals.Tobacco.Cigarette Past Month.12-17              int64
Totals.Tobacco.Cigarette Past Month.18-25              int64
Totals.Tobacco.Cigarette Past Month.26+                int64
Rates.Tobacco.Cigarette Past Month.12-17             float64
Rates.Tobacco.Cigarette Past Month.18-25             float64
Rates.Tobacco.Cigarette Past Month.26+               float64
Totals.Illicit Drugs.Cocaine Used Past Year.12-17      int64
Totals.Illicit Drugs.Cocaine Used Past Year.18-25      int64
Totals.Illicit Drugs.Cocaine Used Past Year.26+        int64
Rates.Illicit Drugs.Cocaine Used Past Year.12-17     float64
Rates.Illicit Drugs.Cocaine Used Past Year.18-25     float64
Rates.Illicit Drugs.Cocaine Used Past Year.26+       float64
Totals.Marijuana.New Users.12-17                       int64
Totals.Marijuana.New Users.18-25                       int64
Totals.Marijuana.New Users.26+                         int64
Rates.Marijuana.New Users.12-17                      float64
Rates.Marijuana.New Users.18-25                      float64
Rates.Marijuana.New Users.26+                        float64
Totals.Marijuana.Used Past Month.12-17                 int64
Totals.Marijuana.Used Past Month.18-25                 int64
Totals.Marijuana.Used Past Month.26+                   int64
Rates.Marijuana.Used Past Month.12-17                float64
Rates.Marijuana.Used Past Month.18-25                float64
Rates.Marijuana.Used Past Month.26+                  float64
Totals.Marijuana.Used Past Year.12-17                  int64
Totals.Marijuana.Used Past Year.18-25                  int64
Totals.Marijuana.Used Past Year.26+                    int64
Rates.Marijuana.Used Past Year.12-17                 float64
Rates.Marijuana.Used Past Year.18-25                 float64
Rates.Marijuana.Used Past Year.26+                   float64
Totals.Tobacco.Use Past Month.12-17                    int64
Totals.Tobacco.Use Past Month.18-25                    int64
Totals.Tobacco.Use Past Month.26+                      int64
Rates.Tobacco.Use Past Month.12-17                   float64
Rates.Tobacco.Use Past Month.18-25                   float64
Rates.Tobacco.Use Past Month.26+                     float64
dtype: object


# List the dtype of every column in the crime table.
crimes.dtypes

State                             object
Year                               int64
Data.Population                    int64
Data.Rates.Property.All          float64
Data.Rates.Property.Burglary     float64
Data.Rates.Property.Larceny      float64
Data.Rates.Property.Motor        float64
Data.Rates.Violent.All           float64
Data.Rates.Violent.Assault       float64
Data.Rates.Violent.Murder        float64
Data.Rates.Violent.Rape          float64
Data.Rates.Violent.Robbery       float64
Data.Totals.Property.All           int64
Data.Totals.Property.Burglary      int64
Data.Totals.Property.Larceny       int64
Data.Totals.Property.Motor         int64
Data.Totals.Violent.All            int64
Data.Totals.Violent.Assault        int64
Data.Totals.Violent.Murder         int64
Data.Totals.Violent.Rape           int64
Data.Totals.Violent.Robbery        int64
dtype: object


def get_columns(dataframe, key, num):
    """Return a new frame that keeps only the columns whose name contains `key`.

    The first `num` columns are protected and always kept; every later column
    whose name does not contain `key` is dropped.  `dataframe` itself is not
    modified.
    """
    candidates = dataframe.columns[num:]  # the leading columns are never dropped
    unwanted = [name for name in candidates if key not in name]
    return dataframe.drop(columns=unwanted)

def get_not_columns(dataframe, key, num):
    """Return a new frame without the columns whose name contains `key`.

    The first `num` columns are protected and always kept; every later column
    whose name contains `key` is dropped.  `dataframe` itself is not modified.
    """
    candidates = dataframe.columns[num:]  # the leading columns are never dropped
    unwanted = [name for name in candidates if key in name]
    return dataframe.drop(columns=unwanted)


def graph_drug(df, cols, labels, ylim):
    """Draw three side-by-side line charts, one line per state in each.

    Panel k plots column `cols[k]` of `df` against "Year", grouped by
    "State", and is titled `labels[k]`.  All panels use the y-range
    (0, ylim).  Returns whatever `fig.tight_layout()` returns.
    """
    fig, axes = plt.subplots(1, 3)
    fig.set_figwidth(15)
    fig.set_figheight(5)

    # Index by year once; every panel reuses the same per-state grouping.
    per_state = df.set_index("Year").groupby("State")

    for panel in range(3):
        ax = axes[panel]
        per_state[cols[panel]].plot.line(
            ylim=(0, ylim), alpha=0.7, fontsize=14, ax=ax)

        ax.set_title(labels[panel], fontsize=15)
        ax.set_xlabel("Year", fontsize=15)
        ax.set_ylabel("Fraction", fontsize=15)
        ax.tick_params(labelrotation=45)

    return fig.tight_layout()


def capitalize(string):
    """Return `string` with only its first character upper-cased.

    Unlike `str.capitalize`, the remaining characters are left untouched.
    An empty string is returned unchanged instead of raising IndexError.
    """
    # Slicing (rather than indexing) keeps this safe for the empty string.
    return string[:1].upper() + string[1:]


def regress(df, labels, xs, y):
    """Draw three scatter plots of `y` against each of `xs` with a fitted line.

    Panel k regresses `df[y]` on `df[xs[k]]` with `stats.linregress`, shows
    the points plus the fitted line in red, puts the slope (rounded to three
    decimals) in the title and uses `labels[k]` as the x-axis label.  Both
    axes are fixed to (-2, 2).  Returns whatever `fig.tight_layout()` returns.
    """
    fig, axes = plt.subplots(1, 3)
    fig.set_figwidth(15)
    fig.set_figheight(5)

    for panel in range(3):
        x_col = xs[panel]
        ax = axes[panel]
        fit = stats.linregress(df[x_col], df[y])

        df.plot.scatter(x=x_col, y=y, ax=ax)
        fitted_line = fit.intercept + fit.slope * df[x_col]
        ax.plot(df[x_col], fitted_line, color="red")

        ax.set_title("Slope: " + str(np.round(fit.slope, 3)), fontsize=15)
        ax.set_xlabel(labels[panel], fontsize=12)
        ax.set_ylabel(capitalize(y), fontsize=12)
        ax.set_xlim((-2, 2))
        ax.set_ylim((-2, 2))

    return fig.tight_layout()


def graph_coefs(df, xs, title):
    """Plot the correlation of each column in `xs` with every crime rate.

    For every pair (x in `xs`, crime) the Pearson correlation coefficient is
    taken from `stats.linregress(df[x], df[crime]).rvalue` and drawn as one
    marker: predictors on the x axis, one colour per crime.

    Parameters:
        df: frame containing the columns named in `xs` and the seven crime
            columns listed in `ys` below.
        xs: names of the predictor columns.
        title: title of the figure.

    Returns None; the figure is left on the current matplotlib state.
    """
    ys = ["burglary", "larceny", "motor", "assault", "murder", "rape", "robbery"]

    # blue-ish for property, red-ish for violent
    colors = ["lightsteelblue", "deepskyblue", "royalblue", "darksalmon", "red", "darkorange", "firebrick"]

    # One row per predictor; the predictor names live in an "index" column so
    # they can be used as the x values of the scatter plot.
    # rvalue (not slope) is used so the plotted value really is a correlation
    # coefficient, bounded by the (-1, 1) axis, even when `df` is not
    # standardised.  On z-scored data the two are the same number.
    all_coefs = pd.DataFrame({"index": list(xs)})
    for y in ys:
        all_coefs[y] = [stats.linregress(df[x], df[y]).rvalue for x in xs]

    fig, ax = plt.subplots()
    fig.set_figwidth(15)
    fig.set_figheight(6)

    for y, color in zip(ys, colors):
        all_coefs.plot.scatter(x="index", y=y, ax=ax, marker='o', color=color, s=100, alpha=0.65)

    ax.set_title(title)
    ax.legend(ys)
    ax.set_ylabel("Correlation Coefficients")
    ax.set_ylim((-1, 1))


def get_scaled_state(df, state):
    """Return the rows of `df` for one state with the data columns z-scored.

    The first two columns are passed through unchanged; every remaining
    column is standardised within the selected rows using `stats.zscore`
    (column-wise, population standard deviation).

    Parameters:
        df: frame with a "State" column; columns from position 2 onward must
            be numeric.
        state: value of "State" to select.

    Returns a DataFrame with the same index, column names and column order as
    the selected rows.
    """
    s = df[df["State"] == state]
    values = s.iloc[:, 2:]

    # Hand zscore a plain array and rebuild the frame ourselves: whether
    # zscore returns a DataFrame or an ndarray for DataFrame input depends on
    # the SciPy version, and pd.concat cannot take a bare ndarray.
    scaled = pd.DataFrame(
        stats.zscore(values.to_numpy(dtype=float), axis=0),
        index=values.index,
        columns=values.columns,
    )
    return pd.concat([s.iloc[:, :2], scaled], axis=1)


# keep only the rate (percentage) columns, not the raw user counts,
# and give the columns used later shorter names
drugs_pct = get_columns(drugs, "Rates", 2).rename(columns={
    "Rates.Alcohol.Use Disorder Past Year.12-17": "alcoholism past year:12-17",
    "Rates.Alcohol.Use Disorder Past Year.18-25": "alcoholism past year:18-25",
    "Rates.Alcohol.Use Disorder Past Year.26+": "alcoholism past year:26+",
    "Rates.Alcohol.Use Past Month.12-17": "alcohol used past month:12-17",
    "Rates.Alcohol.Use Past Month.18-25": "alcohol used past month:18-25",
    "Rates.Alcohol.Use Past Month.26+": "alcohol used past month:26+",

    "Rates.Illicit Drugs.Cocaine Used Past Year.12-17": "cocaine used past year:12-17",
    "Rates.Illicit Drugs.Cocaine Used Past Year.18-25": "cocaine used past year:18-25",
    "Rates.Illicit Drugs.Cocaine Used Past Year.26+": "cocaine used past year:26+",

    "Rates.Marijuana.Used Past Month.12-17": "marijuana used past month:12-17",
    "Rates.Marijuana.Used Past Month.18-25": "marijuana used past month:18-25",
    "Rates.Marijuana.Used Past Month.26+": "marijuana used past month:26+",
    "Rates.Marijuana.Used Past Year.12-17": "marijuana used past year:12-17",
    "Rates.Marijuana.Used Past Year.18-25": "marijuana used past year:18-25",
    "Rates.Marijuana.Used Past Year.26+": "marijuana used past year:26+",

    "Rates.Tobacco.Use Past Month.12-17": "tobacco used past month:12-17",
    "Rates.Tobacco.Use Past Month.18-25": "tobacco used past month:18-25",
    "Rates.Tobacco.Use Past Month.26+": "tobacco used past month:26+",
})


# alcohol-related usage rates
alcohol_pct = get_columns(drugs_pct, "alcohol", 2)

# tobacco-related usage rates, excluding any cigarette-only columns
tobacco_pct = get_columns(drugs_pct, "tobacco", 2)
tobacco_pct = get_not_columns(tobacco_pct, "Cigarette", 2)

# cocaine-related usage rates
cocaine_pct = get_columns(drugs_pct, "cocaine", 2)

# marijuana-related usage rates, excluding any new-user columns
marijuana_pct = get_columns(drugs_pct, "marijuana", 2)
marijuana_pct = get_not_columns(marijuana_pct, "New Users", 2)


# give the per-crime rate columns short names
crimes = crimes.rename(columns={
    "Data.Rates.Property.Burglary": "burglary",
    "Data.Rates.Property.Larceny": "larceny",
    "Data.Rates.Property.Motor": "motor",
    "Data.Rates.Violent.Assault": "assault",
    "Data.Rates.Violent.Murder": "murder",
    "Data.Rates.Violent.Rape": "rape",
    "Data.Rates.Violent.Robbery": "robbery",
})


# drop every column (after State and Year) that still carries a "Data" name,
# which leaves only the columns renamed above
crimes_pct = get_not_columns(crimes, "Data", 2)


# Look up the (State, Year) pairs with the largest and smallest value of the
# "alcoholism past year:18-25" column, plus the values themselves.
# NOTE(review): the printed text says "alcohol use", but the column read here
# is the "alcoholism" one - confirm which wording is intended.
_alcoholism_18_25 = alcohol_pct.set_index(["State", "Year"])["alcoholism past year:18-25"]

print('State with highest alcohol use in the past year rate from 2002 to 2018:',
      f"{_alcoholism_18_25.idxmax()};",
      _alcoholism_18_25.max())

print('State with lowest alcohol use in the past year rate from 2002 to 2018:',
      f"{_alcoholism_18_25.idxmin()};",
      _alcoholism_18_25.min())

State with highest alcohol use in the past year rate from 2002 to 2018: ('North Dakota', 2003); 0.272941
State with lowest alcohol use in the past year rate from 2002 to 2018: ('Florida', 2018); 0.071218


# Mean of the "alcoholism past year:18-25" column over all rows.
alcohol_pct['alcoholism past year:18-25'].mean()

0.15118001384083027


# alcoholism rates per age group
cols = [f"alcoholism past year:{ages}" for ages in ("12-17", "18-25", "26+")]
labels = [f"Alcoholism: Ages {ages}" for ages in ("12 - 17", "18 - 25", "26+")]

graph_drug(alcohol_pct, cols, labels, 0.3)


# tobacco use per age group
cols = [f"tobacco used past month:{ages}" for ages in ("12-17", "18-25", "26+")]
labels = [f"Tobacco Use in the Past Month: Ages {ages}" for ages in ("12-17", "18-25", "26+")]

graph_drug(tobacco_pct, cols, labels, 0.65)


# cocaine use per age group
cols = [f"cocaine used past year:{ages}" for ages in ("12-17", "18-25", "26+")]
labels = [f"Cocaine Use in the Past Year: Ages {ages}" for ages in ("12-17", "18-25", "26+")]

graph_drug(cocaine_pct, cols, labels, 0.13)


# marijuana use per age group
cols = [f"marijuana used past year:{ages}" for ages in ("12-17", "18-25", "26+")]
labels = [f"Marijuana Use in the Past Year: Ages {ages}" for ages in ("12-17", "18-25", "26+")]

graph_drug(marijuana_pct, cols, labels, 0.55)


# keep only the crime rates for the years 2002 through 2018
crimes_pct = crimes_pct[crimes_pct["Year"].between(2002, 2018)].iloc[:, :12]
crimes_pct.head()


fig, axes = plt.subplots(3, 3)
fig.set_figheight(14)
fig.set_figwidth(14)


crimes_list = ["burglary", "larceny", "motor", "assault", "murder", "rape", "robbery"]
starter = crimes_pct.set_index("Year").groupby("State")

# one panel per crime, filling the 3x3 grid row by row
for ax, crime in zip(axes.flat, crimes_list):
    starter[crime].plot.line(ylabel="Offenses per 100,000 People", ax=ax)
    ax.set_title(capitalize(crime))

# seven crimes leave the last two panels of the grid empty
fig.delaxes(axes[2][1])
fig.delaxes(axes[2][2])
fig.tight_layout()


# Candidate predictors: past-month alcohol and tobacco use, past-year cocaine
# and marijuana use.  The cocaine slice starts at position 1, so "Year" is
# part of the candidate set as well.
features = [
    *alcohol_pct.columns[5:],
    *cocaine_pct.columns[1:],
    *tobacco_pct.columns[2:],
    *marijuana_pct.columns[5:],
]

merged = drugs_pct.merge(crimes_pct, on=["State", "Year"], how="inner")

# less than 9 features is never optimal
# (filtering on length instead of slicing the powerset at a fixed position
# keeps this correct if the number of candidate features changes)
all_combos = [combo for combo in powerset(features) if len(combo) >= 9]


# Exhaustive search: for every crime, find the feature subset whose 2-nearest-
# neighbour regressor has the lowest MAPE.
# NOTE(review): the error is measured on the same rows the model was fitted
# on, so it describes in-sample fit rather than predictive accuracy.
crime_targets = ["burglary", "larceny", "motor", "assault", "murder", "rape", "robbery"]
best_by_crime = {crime: (np.inf, []) for crime in crime_targets}

for combo in all_combos:
    combo_cols = list(combo)

    # the scaled feature matrix depends only on the subset, so build it once
    # and reuse it for every crime
    x_train_sc = StandardScaler().fit_transform(merged[combo_cols])

    for crime in crime_targets:
        y_train = merged[crime]

        model = KNeighborsRegressor(n_neighbors=2) # the best value is always 2
        model.fit(x_train_sc, y_train)

        y_pred = model.predict(x_train_sc)
        mape = mean_absolute_percentage_error(y_train, y_pred)

        # strict comparison: on ties the earliest subset wins
        if mape < best_by_crime[crime][0]:
            best_by_crime[crime] = (mape, combo_cols)

for crime in crime_targets:
    final_mape, final_features = best_by_crime[crime]
    print("crime:", crime)
    print("mape:", np.round(final_mape*100, 2), "%")
    print("# of features:", len(final_features))
    print("features:", final_features)
    print("---------------------------------------------------------------------------------")

crime: burglary
mape: 5.47 %
# of features: 12
features: ['alcohol used past month:12-17', 'alcohol used past month:18-25', 'alcohol used past month:26+', 'Year', 'cocaine used past year:12-17', 'cocaine used past year:18-25', 'cocaine used past year:26+', 'tobacco used past month:12-17', 'tobacco used past month:18-25', 'tobacco used past month:26+', 'marijuana used past year:18-25', 'marijuana used past year:26+']
---------------------------------------------------------------------------------
crime: larceny
mape: 3.7 %
# of features: 11
features: ['alcohol used past month:12-17', 'alcohol used past month:18-25', 'alcohol used past month:26+', 'Year', 'cocaine used past year:12-17', 'cocaine used past year:26+', 'tobacco used past month:12-17', 'tobacco used past month:18-25', 'tobacco used past month:26+', 'marijuana used past year:18-25', 'marijuana used past year:26+']
---------------------------------------------------------------------------------
crime: motor
mape: 7.75 %
# of features: 10
features: ['alcohol used past month:12-17', 'alcohol used past month:18-25', 'alcohol used past month:26+', 'Year', 'cocaine used past year:12-17', 'cocaine used past year:26+', 'tobacco used past month:18-25', 'tobacco used past month:26+', 'marijuana used past year:18-25', 'marijuana used past year:26+']
---------------------------------------------------------------------------------
crime: assault
mape: 7.49 %
# of features: 11
features: ['alcohol used past month:12-17', 'alcohol used past month:18-25', 'alcohol used past month:26+', 'Year', 'cocaine used past year:12-17', 'cocaine used past year:26+', 'tobacco used past month:18-25', 'tobacco used past month:26+', 'marijuana used past year:12-17', 'marijuana used past year:18-25', 'marijuana used past year:26+']
---------------------------------------------------------------------------------
crime: murder
mape: 9.64 %
# of features: 11
features: ['alcohol used past month:12-17', 'alcohol used past month:18-25', 'alcohol used past month:26+', 'Year', 'cocaine used past year:12-17', 'cocaine used past year:26+', 'tobacco used past month:18-25', 'tobacco used past month:26+', 'marijuana used past year:12-17', 'marijuana used past year:18-25', 'marijuana used past year:26+']
---------------------------------------------------------------------------------
crime: rape
mape: 7.08 %
# of features: 9
features: ['alcohol used past month:12-17', 'alcohol used past month:26+', 'Year', 'cocaine used past year:18-25', 'cocaine used past year:26+', 'tobacco used past month:12-17', 'tobacco used past month:26+', 'marijuana used past year:18-25', 'marijuana used past year:26+']
---------------------------------------------------------------------------------
crime: robbery
mape: 8.31 %
# of features: 11
features: ['alcohol used past month:12-17', 'alcohol used past month:18-25', 'alcohol used past month:26+', 'Year', 'cocaine used past year:12-17', 'cocaine used past year:26+', 'tobacco used past month:12-17', 'tobacco used past month:18-25', 'tobacco used past month:26+', 'marijuana used past year:18-25', 'marijuana used past year:26+']
---------------------------------------------------------------------------------


# drug-use and crime rates joined per state and year
merged = drugs_pct.merge(crimes_pct, on=["State", "Year"], how="inner")


# Louisiana rows only, with every data column standardised
la_sc = get_scaled_state(merged, "Louisiana")


# past-month alcohol use against burglary
labels = ["Alcohol Use in Past Month: Ages 12-17", "Alcohol Use in Past Month: Ages 18-25", "Alcohol Use in Past Month: Ages 26+"]
xs = ["alcohol used past month:12-17", "alcohol used past month:18-25", "alcohol used past month:26+"]
y = "burglary"

regress(la_sc, labels, xs, y)


# past-year cocaine use against larceny
labels = ["Cocaine Use in Past Year: Ages 12-17", "Cocaine Use in Past Year: Ages 18-25", "Cocaine Use in Past Year: Ages 26+"]
xs = ["cocaine used past year:12-17", "cocaine used past year:18-25", "cocaine used past year:26+"]
y = "larceny"

regress(la_sc, labels, xs, y)


# Titles below spell "Louisiana" correctly, and the tobacco title says
# "Past Month" to match the past-month columns that are actually plotted.
xs = ["alcohol used past month:12-17", "alcohol used past month:18-25", "alcohol used past month:26+"]
graph_coefs(la_sc, xs, "Alcohol Use in the Past Month and All Crimes in Louisiana")


xs = ['tobacco used past month:12-17', 'tobacco used past month:18-25', 'tobacco used past month:26+']
graph_coefs(la_sc, xs, "Tobacco Use in the Past Month and All Crimes in Louisiana")


xs = ['cocaine used past year:12-17', 'cocaine used past year:18-25', 'cocaine used past year:26+']
graph_coefs(la_sc, xs, "Cocaine Use in the Past Year and All Crimes in Louisiana")


xs = ['marijuana used past year:12-17', 'marijuana used past year:18-25', 'marijuana used past year:26+']
graph_coefs(la_sc, xs, "Marijuana Use in the Past Year and All Crimes in Louisiana")

	State	Year	Population.12-17	Population.18-25	Population.26+	Totals.Alcohol.Use Disorder Past Year.12-17	Totals.Alcohol.Use Disorder Past Year.18-25	Totals.Alcohol.Use Disorder Past Year.26+	Rates.Alcohol.Use Disorder Past Year.12-17	Rates.Alcohol.Use Disorder Past Year.18-25	...	Totals.Marijuana.Used Past Year.26+	Rates.Marijuana.Used Past Year.12-17	Rates.Marijuana.Used Past Year.18-25	Rates.Marijuana.Used Past Year.26+	Totals.Tobacco.Use Past Month.12-17	Totals.Tobacco.Use Past Month.18-25	Totals.Tobacco.Use Past Month.26+	Rates.Tobacco.Use Past Month.12-17	Rates.Tobacco.Use Past Month.18-25	Rates.Tobacco.Use Past Month.26+
0	Alabama	2002	380805	499453	2812905	18	68	138	0.048336	0.136490	...	141	0.127535	0.237880	0.050275	63	226	930	0.166578	0.451976	0.330659
1	Alaska	2002	69400	62791	368460	4	12	27	0.061479	0.187891	...	46	0.188730	0.389026	0.124566	11	30	112	0.163918	0.484270	0.304220
2	Arizona	2002	485521	602265	3329482	36	117	258	0.073819	0.193626	...	215	0.169646	0.275435	0.064640	73	240	1032	0.151071	0.397968	0.309969
3	Arkansas	2002	232986	302029	1687337	14	53	101	0.061457	0.175913	...	104	0.157567	0.288856	0.061510	46	169	660	0.195714	0.558846	0.391210
4	California	2002	3140739	3919577	21392421	173	581	1298	0.055109	0.148312	...	1670	0.141067	0.282887	0.078068	290	1377	4721	0.092235	0.351353	0.220699

	State	Year	Data.Population	Data.Rates.Property.All	Data.Rates.Property.Burglary	Data.Rates.Property.Larceny	Data.Rates.Property.Motor	Data.Rates.Violent.All	Data.Rates.Violent.Assault	Data.Rates.Violent.Murder	...	Data.Rates.Violent.Robbery	Data.Totals.Property.All	Data.Totals.Property.Burglary	Data.Totals.Property.Larceny	Data.Totals.Property.Motor	Data.Totals.Violent.All	Data.Totals.Violent.Assault	Data.Totals.Violent.Murder	Data.Totals.Violent.Rape	Data.Totals.Violent.Robbery
0	Alabama	1960	3266740	1035.4	355.9	592.1	87.3	186.6	138.1	12.4	...	27.5	33823	11626	19344	2853	6097	4512	406	281	898
1	Alabama	1961	3302000	985.5	339.3	569.4	76.8	168.5	128.9	12.9	...	19.1	32541	11205	18801	2535	5564	4255	427	252	630
2	Alabama	1962	3358000	1067.0	349.1	634.5	83.4	157.3	119.0	9.4	...	22.5	35829	11722	21306	2801	5283	3995	316	218	754
3	Alabama	1963	3347000	1150.9	376.9	683.4	90.6	182.7	142.1	10.2	...	24.7	38521	12614	22874	3033	6115	4755	340	192	828
4	Alabama	1964	3407000	1358.7	466.6	784.1	108.0	213.1	163.0	9.3	...	29.1	46290	15898	26713	3679	7260	5555	316	397	992

	State	Year	burglary	larceny	motor	assault	murder	rape	robbery
42	Alabama	2002	950.6	2767.0	310.1	268.0	6.8	37.2	133.1
43	Alabama	2003	960.2	2754.1	332.1	251.7	6.6	36.8	134.1
44	Alabama	2004	987.0	2732.4	309.9	249.4	5.6	38.5	133.5
45	Alabama	2005	955.8	2656.0	289.0	248.3	8.2	34.4	141.7
46	Alabama	2006	973.7	2640.8	326.5	227.5	8.3	35.8	153.6

The Relationship Between Drug Use and Crime Rates¶

Jonathan Licht and Jim Haines¶

https://jlicht27.github.io/¶

Overview¶

Libraries and Importing¶

Some useful functions¶

Data Preprocessing¶

Drugs¶

Alcohol¶

Tobacco¶

Cocaine¶

Marijuana¶

Crimes¶

Drug/Alcohol Use and Crimes¶

Country Level¶

KNN Model¶

Louisiana¶

Alcohol Use in the Past Month and Burglary, in Louisiana¶

Cocaine Use in the Past Year and Larceny, in Louisiana¶

Correlation Coefficients Visualized¶

Conclusion¶