NYPD Shooting Incident Data

NYPD Shooting Incident Data

This is a project that looks at the NYPD Shooting Incident Data (Historic) from the DATA.gov catalog of datasets. This dataset lists every shooting incident in New York City from 2006 through the end of the previous calendar year. It is compiled by NYC Open Data and was last updated on December 11, 2021.

Each row in the data is a "shooting incident" and there are 19 columns.

Loading the Data

library(assertthat)
library(knitr)
library(lubridate)
library(tidyverse)

Lubridate is part of the tidyverse but it has to be loaded for some reason.

DATA_URL <- "https://data.cityofnewyork.us/api/views/833y-fsy8/rows.csv?accessType=DOWNLOAD" 

data = read_csv(DATA_URL)

[1mindexed[0m [32m0B[0m in [36m 0s[0m, [32m0B/s[0m
[1mindexed[0m [32m393.21kB[0m in [36m 0s[0m, [32m983.15kB/s[0m
                                                                              
[1mindexed[0m [32m524.28kB[0m in [36m 1s[0m, [32m1.03MB/s[0m
[1mindexed[0m [32m655.36kB[0m in [36m 1s[0m, [32m1.22MB/s[0m
[1mindexed[0m [32m786.43kB[0m in [36m 1s[0m, [32m1.24MB/s[0m
[1mindexed[0m [32m917.50kB[0m in [36m 1s[0m, [32m1.23MB/s[0m
                                                                              
[1mindexed[0m [32m1.05MB[0m in [36m 1s[0m, [32m1.18MB/s[0m
[1mindexed[0m [32m1.18MB[0m in [36m 1s[0m, [32m1.18MB/s[0m
[1mindexed[0m [32m1.31MB[0m in [36m 1s[0m, [32m1.06MB/s[0m
[1mindexed[0m [32m1.44MB[0m in [36m 2s[0m, [32m947.54kB/s[0m
                                                                              
[1mindexed[0m [32m1.57MB[0m in [36m 2s[0m, [32m1.00MB/s[0m
[1mindexed[0m [32m1.70MB[0m in [36m 2s[0m, [32m1.04MB/s[0m
[1mindexed[0m [32m1.83MB[0m in [36m 2s[0m, [32m1.04MB/s[0m
[1mindexed[0m [32m1.97MB[0m in [36m 2s[0m, [32m1.02MB/s[0m
[1mindexed[0m [32m2.10MB[0m in [36m 2s[0m, [32m1.03MB/s[0m
[1mindexed[0m [32m2.23MB[0m in [36m 2s[0m, [32m994.69kB/s[0m
[1mindexed[0m [32m2.36MB[0m in [36m 2s[0m, [32m965.18kB/s[0m
[1mindexed[0m [32m2.49MB[0m in [36m 3s[0m, [32m939.81kB/s[0m
[1mindexed[0m [32m2.62MB[0m in [36m 3s[0m, [32m951.26kB/s[0m
[1mindexed[0m [32m2.75MB[0m in [36m 3s[0m, [32m919.65kB/s[0m
[1mindexed[0m [32m2.88MB[0m in [36m 3s[0m, [32m855.66kB/s[0m
[1mindexed[0m [32m3.01MB[0m in [36m 3s[0m, [32m892.99kB/s[0m
[1mindexed[0m [32m3.15MB[0m in [36m 3s[0m, [32m904.11kB/s[0m
[1mindexed[0m [32m3.28MB[0m in [36m 4s[0m, [32m867.67kB/s[0m
[1mindexed[0m [32m3.41MB[0m in [36m 4s[0m, [32m878.71kB/s[0m
[1mindexed[0m [32m3.54MB[0m in [36m 4s[0m, [32m888.47kB/s[0m
[1mindexed[0m [32m3.67MB[0m in [36m 4s[0m, [32m899.43kB/s[0m
[1mindexed[0m [32m3.80MB[0m in [36m 4s[0m, [32m846.12kB/s[0m
[1mindexed[0m [32m3.93MB[0m in [36m 5s[0m, [32m861.49kB/s[0m
[1mindexed[0m [32m4.06MB[0m in [36m 5s[0m, [32m876.08kB/s[0m
[1mindexed[0m [32m4.19MB[0m in [36m 5s[0m, [32m854.40kB/s[0m
[1mindexed[0m [32m4.33MB[0m in [36m 5s[0m, [32m877.77kB/s[0m
[1mindexed[0m [32m4.46MB[0m in [36m 5s[0m, [32m855.56kB/s[0m
[1mindexed[0m [32m4.59MB[0m in [36m 5s[0m, [32m865.02kB/s[0m
[1mindexed[0m [32m4.72MB[0m in [36m 6s[0m, [32m855.32kB/s[0m
[1mindexed[0m [32m4.85MB[0m in [36m 6s[0m, [32m847.74kB/s[0m
[1mindexed[0m [32m4.98MB[0m in [36m 6s[0m, [32m870.08kB/s[0m
[1mindexed[0m [32m5.11MB[0m in [36m 6s[0m, [32m892.61kB/s[0m
[1mindexed[0m [32m5.11MB[0m in [36m 6s[0m, [32m892.93kB/s[0m
[1mindexed[0m [32m1.00TB[0m in [36m 6s[0m, [32m174.56GB/s[0m
                                                                              
[1mRows: [22m[34m23585[39m [1mColumns: [22m[34m19[39m
[36m──[39m [1mColumn specification[22m [36m──────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (10): OCCUR_DATE, BORO, LOCATION_DESC, PERP_AGE_GROUP, PERP_SEX, PERP_R...
[32mdbl[39m   (7): INCIDENT_KEY, PRECINCT, JURISDICTION_CODE, X_COORD_CD, Y_COORD_CD...
[33mlgl[39m   (1): STATISTICAL_MURDER_FLAG
[34mtime[39m  (1): OCCUR_TIME

[36mℹ[39m Use [30m[47m[30m[47m`spec()`[47m[30m[49m[39m to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set [30m[47m[30m[47m`show_col_types = FALSE`[47m[30m[49m[39m to quiet this message.

Tidying and Transforming the Data

spec(data)
cols(
  INCIDENT_KEY = [32mcol_double()[39m,
  OCCUR_DATE = [31mcol_character()[39m,
  OCCUR_TIME = [34mcol_time(format = "")[39m,
  BORO = [31mcol_character()[39m,
  PRECINCT = [32mcol_double()[39m,
  JURISDICTION_CODE = [32mcol_double()[39m,
  LOCATION_DESC = [31mcol_character()[39m,
  STATISTICAL_MURDER_FLAG = [33mcol_logical()[39m,
  PERP_AGE_GROUP = [31mcol_character()[39m,
  PERP_SEX = [31mcol_character()[39m,
  PERP_RACE = [31mcol_character()[39m,
  VIC_AGE_GROUP = [31mcol_character()[39m,
  VIC_SEX = [31mcol_character()[39m,
  VIC_RACE = [31mcol_character()[39m,
  X_COORD_CD = [32mcol_double()[39m,
  Y_COORD_CD = [32mcol_double()[39m,
  Latitude = [32mcol_double()[39m,
  Longitude = [32mcol_double()[39m,
  Lon_Lat = [31mcol_character()[39m
)

Incident Key

According to the Data Description the "Incident Key" is a randomly generated identifier for the incident. This is a likely candidate to remove unless we look at specific incidents.

Occur Date

This is the date that the shooting happened. It was loaded as a string so we should convert it to a date.

data$OCCUR_DATE[1]
[1] "08/27/2006"
data$DATE <- as.Date(data$OCCUR_DATE, format="%m/%d/%Y")
str(data$DATE)
data$DATE[1]
assert_that(noNA(data$DATE))
 Date[1:23585], format: "2006-08-27" "2011-03-11" "2019-10-06" "2011-09-04" "2013-05-27" ...
[1] "2006-08-27"
[1] TRUE

Occur Time

This is the time of the shooting. It appears to have already been read in as a time.

str(data$OCCUR_TIME)
assert_that(noNA(data$OCCUR_TIME))
 'hms' num [1:23585] 05:35:00 12:03:00 01:09:00 03:35:00 ...
 - attr(*, "units")= chr "secs"
[1] TRUE

Boro

This is the NYC Borough where the shooting happened.

data$Borough <- as.factor(data$BORO)
assert_that(is.factor(data$Borough))
assert_that(noNA(data$Borough))
[1] TRUE
[1] TRUE

knitr.kable reformats the dataframe to a more readable table. Kind of like tabulate but less flexible.

big_mark <- function(vector, columns){
    kable(table(vector), col.names=columns, format.args=list(big.mark=","))
}
big_mark(data$Borough, c("Borough", "Shootings"))
Borough Shootings
BRONX 6,701
BROOKLYN 9,734
MANHATTAN 2,922
QUEENS 3,532
STATEN ISLAND 696

Precinct

This is the precinct number where the shooting happened.

n_distinct(data$PRECINCT)
str(data$PRECINCT)
data$PRECINCT_FACTOR <- as.factor(data$PRECINCT)
see_if(noNA(data$PRECINCT))
[1] 77
 num [1:23585] 52 106 77 40 100 67 77 81 101 106 ...
[1] TRUE

Jurisdiction Code

The jurisdiction where the shooting occurred. There are three jurisdictions that are NYC jurisdictions:

Code Jurisdiction
0 Patrol
1 Transit
2 Housing

Any numbers higher than this are non-NYC jurisdictions.

data$JURISDICTION <- as.factor(data$JURISDICTION_CODE)
see_if(noNA(data$JURISDICTION))
[1] FALSE
attr(,"msg")
[1] "data$JURISDICTION contains 2 missing values"
big_mark(data$JURISDICTION, c("Jurisdiction", "Shootings"))
Jurisdiction Shootings
0 19,629
1 54
2 3,900

Location Description

data$LOCATION_DESC[1]
see_if(noNA(data$LOCATION_DESC))
[1] NA
[1] FALSE
attr(,"msg")
[1] "data$LOCATION_DESC contains 13581 missing values"
n_distinct(data$LOCATION_DESC)
[1] 40

Here's what the first filled-in entry looks like.

first(na.omit(data$LOCATION_DESC))
[1] "MULTI DWELL - PUBLIC HOUS"

Too bad there's so many missing values.

big_mark(data$LOCATION_DESC, c("Location", "Shootings"))
Location Shootings
ATM 1
BANK 1
BAR/NIGHT CLUB 562
BEAUTY/NAIL SALON 100
CANDY STORE 6
CHAIN STORE 5
CHECK CASH 1
CLOTHING BOUTIQUE 14
COMMERCIAL BLDG 234
DEPT STORE 5
DOCTOR/DENTIST 1
DRUG STORE 11
DRY CLEANER/LAUNDRY 30
FACTORY/WAREHOUSE 6
FAST FOOD 98
GAS STATION 53
GROCERY/BODEGA 574
GYM/FITNESS FACILITY 3
HOSPITAL 38
HOTEL/MOTEL 24
JEWELRY STORE 12
LIQUOR STORE 36
LOAN COMPANY 1
MULTI DWELL - APT BUILD 2,553
MULTI DWELL - PUBLIC HOUS 4,240
NONE 175
PHOTO/COPY STORE 1
PVT HOUSE 857
RESTAURANT/DINER 188
SCHOOL 1
SHOE STORE 9
SMALL MERCHANT 25
SOCIAL CLUB/POLICY LOCATI 66
STORAGE FACILITY 1
STORE UNCLASSIFIED 35
SUPERMARKET 19
TELECOMM. STORE 5
VARIETY STORE 11
VIDEO STORE 2

Statistical Murder Flag

This is a checkbox indicating that the victim died as a result of the shooting.

see_if(noNA(data$STATISTICAL_MURDER_FLAG))
[1] TRUE
big_mark(data$STATISTICAL_MURDER_FLAG, c("Victim Died", "Count"))
Victim Died Count
FALSE 19,085
TRUE 4,500

Perpetrator's Age Group

see_if(noNA(data$PERP_AGE_GROUP))
data$PERP_AGE <- as.factor(data$PERP_AGE_GROUP)
[1] FALSE
attr(,"msg")
[1] "data$PERP_AGE_GROUP contains 8295 missing values"
big_mark(data$PERP_AGE, c("Perpetrator's Age Group", "Shootings"))
Perpetrator's Age Group Shootings
<18 1,368
1020 1
18-24 5,508
224 1
25-44 4,714
45-64 495
65+ 54
940 1
UNKNOWN 3,148

Peprpetrator's Sex

There are three values for "Sex" - Female (F), Male (M), or Unknown (U).

see_if(noNA(data$PERP_SEX))
data$PERP_SEX <- as.factor(data$PERP_SEX)
[1] FALSE
attr(,"msg")
[1] "data$PERP_SEX contains 8261 missing values"
big_mark(data$PERP_SEX, c("Perpetrator's Sex", "Shootings"))
Perpetartor's Sex Shootings
F 335
M 13,490
U 1,499

Perpetrator's Race

see_if(noNA(data$PERP_RACE))
data$PERP_RACE <- as.factor(data$PERP_RACE)
[1] FALSE
attr(,"msg")
[1] "data$PERP_RACE contains 8261 missing values"
big_mark(data$PERP_RACE, c("Perpetrator's Race", "Shootings"))
Perpetrator's Race Shootings
AMERICAN INDIAN/ALASKAN NATIVE 2
ASIAN / PACIFIC ISLANDER 122
BLACK 10,025
BLACK HISPANIC 1,096
UNKNOWN 1,836
WHITE 255
WHITE HISPANIC 1,988

Victim's Age Group

see_if(noNA(data$VIC_AGE_GROUP))
data$VIC_AGE_GROUP <- as.factor(data$VIC_AGE_GROUP)
[1] TRUE
big_mark(data$VIC_AGE_GROUP, c("Victim's Age Group", "Shootings"))
Victim's Age Group Shootings
<18 2,525
18-24 9,003
25-44 10,303
45-64 1,541
65+ 154
UNKNOWN 59

Victim's Sex

see_if(noNA(data$VIC_SEX))
data$VIC_SEX <- as.factor(data$VIC_SEX)
[1] TRUE
big_mark(data$VIC_SEX, c("Victims' Sex", "Shootings"))
Victims' Sex Shootings
F 2,204
M 21,370
U 11

Victim's Race

see_if(noNA(data$VIC_RACE))
data$VIC_RACE <- as.factor(data$VIC_RACE)
[1] TRUE
big_mark(data$VIC_RACE, c("Victim's Race", "Shootings"))
Victim's Race Shootings
AMERICAN INDIAN/ALASKAN NATIVE 9
ASIAN / PACIFIC ISLANDER 327
BLACK 16,869
BLACK HISPANIC 2,245
UNKNOWN 65
WHITE 620
WHITE HISPANIC 3,450

X-Coordinate

"Midblock X-coordinate for New York State Plane Coordinate System, Long Island Zone, NAD 83, units feet (FIPS 3104)."

see_if(noNA(data$X_COORD_CD))
n_distinct(data$X_COORD_CD)
[1] TRUE
[1] 9911

Y-Coordinate

see_if(noNA(data$Y_COORD_CD))
n_distinct(data$Y_COORD_CD)
[1] TRUE
[1] 9986

Latitude

Latitude coordinate for Global Coordinate System, WGS 1984, decimal degrees (EPSG 4326).

see_if(noNA(data$Latitude))
n_distinct(data$Latitude)
[1] TRUE
[1] 10055

Longitude

Latitude coordinate for Global Coordinate System, WGS 1984, decimal degrees (EPSG 4326).

see_if(noNA(data$Longitude))
n_distinct(data$Longitude)
str(data$Longitude)
[1] TRUE
[1] 10055
 num [1:23585] -73.9 -73.8 -74 -73.9 -73.8 ...

Longitude and Latitude

Longitude and Latitude Coordinates for mapping

see_if(noNA(data$Lon_Lat))
n_distinct(data$Lon_Lat)
str(data$Lon_Lat)
[1] TRUE
[1] 10055
 chr [1:23585] "POINT (-73.87963173099996 40.86905819000003)" ...

Visualization

floor_date is a function from lubridate that comes as part of the tidyverse. It's called floor because it acts like the floor function in math (round down).

Annually

dates <- data
dates$Year <- floor_date(dates$DATE, "year")

annual_shootings <- dates %>% group_by(Year) %>% summarise(Shootings=n())
kable(annual_shootings,
      format.args=list(big.mark=","))
Year Shootings
2006-01-01 2,055
2007-01-01 1,887
2008-01-01 1,959
2009-01-01 1,828
2010-01-01 1,912
2011-01-01 1,939
2012-01-01 1,717
2013-01-01 1,339
2014-01-01 1,464
2015-01-01 1,434
2016-01-01 1,208
2017-01-01 970
2018-01-01 958
2019-01-01 967
2020-01-01 1,948
theme_set(theme_minimal())
PATH = "../files/posts/nypd-shooting-incident-data/"
join_path = partial(paste, PATH, sep="")
plot = ggplot(data=annual_shootings, aes(x=Year, y=Shootings)) +
  geom_line() + geom_point() +
  scale_x_date(date_labels="%Y", name="Year", date_breaks="1 year") +
  labs(title="NYPD Annual Shootings Incidents Reported")
ggsave(join_path("annual_shootings.png"),
       plot=plot)

nil

  • By Borough
    borough_shootings <- dates %>% group_by(Year, Borough) %>% summarise(Shootings=n())
    
    file_name = "annual_shootings_boroughs.png"
    plot = ggplot(data=borough_shootings, aes(x=Year, y=Shootings, group=Borough, colour=Borough)) +
      geom_line() + geom_point() +
      scale_x_date(date_labels="%Y", name="Year", date_breaks="1 year") +
      labs(title="NYPD Annual Shootings by Borough")
    ggsave(join_path(file_name),
           plot=plot)
    

    nil

Monthly

dates$year_month <- floor_date(dates$DATE, "month")
monthly_shootings <- dates %>% group_by(year_month) %>% summarise(Shootings=n())

monthly_shootings$month_number <- as.numeric(
  format(as.Date(monthly_shootings$year_month), "%m"))
monthly_shootings$Month <- months(as.Date(monthly_shootings$year_month),
                                  abbreviate=TRUE)
monthly_shootings$Year <- year(ymd(monthly_shootings$year_month))
monthly_shootings
plot = ggplot(data=monthly_shootings,
              aes(x=month_number,
                  y=Shootings,
                  group=Year, colour=Year)) +
  geom_line() + geom_point() +
  scale_x_continuous(breaks=monthly_shootings$month_number,
                     labels=monthly_shootings$Month, name="Month") +
  labs(title="NYPD Shootings by Month (Year-Over-Year)")

ggsave("../files/posts/nypd-shooting-incident-data/monthly_shootings.png",
       plot=plot)

nil

three_years = monthly_shootings[which(monthly_shootings$Year %in% c(2006, 2018, 2020)),]

plot = ggplot(data=three_years,
              aes(x=month_number,
                  y=Shootings,
                  group=Year, colour=Year)) +
  geom_line() + geom_point() +
  scale_x_continuous(breaks=three_years$month_number,
                     labels=three_years$Month, name="Month") +
  labs(title="NYPD Shootings by Month (2006, 2018, 2020)")

ggsave("../files/posts/nypd-shooting-incident-data/monthly_shootings_three-years.png",
       plot=plot)

nil

  • By Borough
    borough_monthly <- dates %>% group_by(year_month, BORO) %>% summarise(Shootings=n())
    
    borough_monthly$monthly_number <- as.numeric(
      format(as.Date(borough_monthly$year_month), "%m"))
    borough_monthly$Month <- months(as.Date(borough_monthly$year_month),
                                    abbreviate=TRUE)
    borough_monthly$Year <- year(ymd(borough_monthly$year_month))
    borough_monthly
    
    boroughs_2020 = borough_monthly[which(borough_monthly$Year %in% c(2020)),]
    
    plot = ggplot(data=boroughs_2020,
                  aes(x=monthly_number,
                      y=Shootings,
                      group=BORO, colour=BORO)) +
      geom_line() + geom_point() +
      scale_x_continuous(breaks=boroughs_2020$monthly_number,
                         labels=boroughs_2020$Month, name="Month") +
      labs(title="NYPD Shootings by Borough (2020)")
    
    ggsave(join_path("monthly_shootings_by_borough.png"),
           plot=plot)
    

    nil

By Map

dates$Year <- year(ymd(dates$year_month))
last_year <- dates[dates$Year==2020,]
last_year
minimum_latitude = min(last_year$Latitude)
maximum_latitude = max(last_year$Latitude)
minimum_longitude = min(last_year$Longitude)
maximum_longitude = max(last_year$Longitude)


plot = ggplot(data=last_year,
              aes(x=Longitude,
                  y=Latitude, color=BORO)) +
  geom_point(size=1) +
  scale_x_continuous(limits=c(minimum_longitude, maximum_longitude)) +
  scale_y_continuous(limits=c(minimum_latitude, maximum_latitude)) +
  labs(title="NYPD Shootings by Latitude and Longitude (2020)")

ggsave(join_path("monthly_shootings_latitude_longitude.png"),
       plot=plot)

nil