-
Notifications
You must be signed in to change notification settings - Fork 0
Data Preparation 2016
Adam VanIwaarden edited this page Aug 24, 2016
·
2 revisions
Data from three testing periods were provided to the Center for Assessment in July 2016. This data is to be used in the Spring 2016 SGP analyses.
In the source code below, the data supplied by Pearson are comma-separated files, one for each state in the PARCC consortium. These files are
located in each state's folder under Data/Base_Files. A custom function reads each file into R, and the results are combined
into a single data table. The data are then cleaned up variable by variable to ensure that they conform to the data naming conventions used
in the SGP
software package.
The actual R code script is available here.
################################################################################
### ###
### Create PARCC LONG Data for Spring 2016 Analyses ###
### ###
################################################################################
### Load required packages
require(SGP)
require(data.table)
# Make the "PARCC" directory the working directory; the relative paths used by
# the rest of this script are resolved from here.
setwd("PARCC")
###
### Read in Spring 2016 Pearson base data
###
#### Column names, in file order, for the fields in the Pearson file layout
parcc.var.names <- c(
  "AssessmentYear",
  "StateAbbreviation",
  "PARCCStudentIdentifier",
  "GradeLevelWhenAssessed",
  "Period",
  "TestCode",
  "SummativeScoreRecordUUID",
  "StudentTestUUID",
  "SummativeScaleScore",
  "IRTTheta",
  "SummativeCSEM",
  "Filler",
  "TestFormat"
)

#### Growth (SGP) fields; in the full layout they sit between "Filler" and "TestFormat"
center.var.names <- c(
  "StudentGrowthPercentileComparedtoState",
  "StudentGrowthPercentileComparedtoPARCC",
  "SGPPreviousTestCodeState",
  "SGPPreviousTestCodePARCC",
  "SGPUpperBoundState",
  "SGPLowerBoundState",
  "SGPUpperBoundPARCC",
  "SGPLowerBoundPARCC"
)

#### Full layout: every Pearson field except the last, then the growth fields,
#### then "TestFormat" as the final column
all.var.names <- c(
  parcc.var.names[-length(parcc.var.names)],
  center.var.names,
  "TestFormat"
)
#### Function to read in individual state files
#
# Unzips one state's Pearson base file into the session temp directory and
# reads it into a data.table in which every column is character.
#
# Arguments:
#   state  State abbreviation. "BI" maps to the "Bureau_Indian_Affairs" folder;
#          any other value is resolved with SGP:::getStateAbbreviation().
#   year   Pattern (handed to grep(), so it is a regular expression) that must
#          identify exactly one zip file in <state folder>/Data/Base_Files.
#          The value "2014-2015" also selects the layout without "TestFormat".
#   Fall   TRUE selects the layout without "TestFormat".
#
# Returns: a data.table whose columns are named from `all.var.names` (all of
#          them, or all but the last for the 2014-2015 / Fall layout).
#
# Relies on the global `all.var.names` and on a system `unzip` command. The
# extracted file is assumed to be named like the zip minus its ".zip" suffix.
read.parcc <- function(state, year, Fall=FALSE) {
  if (state == "BI") {
    tmp.name <- "Bureau_Indian_Affairs"
  } else {
    tmp.name <- gsub(" ", "_", SGP:::getStateAbbreviation(state, type="state"))
  }
  if (tmp.name == "WASHINGTON_DC") tmp.name <- "Washington_DC"

  # Resolve everything to absolute paths BEFORE leaving the working directory.
  base.dir <- file.path(getwd(), tmp.name, "Data/Base_Files")
  my.zip <- grep(year, list.files(base.dir), value=TRUE)
  if (length(my.zip) != 1L) {
    stop("Expected exactly one file matching '", year, "' in ", base.dir,
         " but found ", length(my.zip), call. = FALSE)
  }
  # Strip only a literal, trailing ".zip" (the dot is escaped and anchored).
  my.file <- sub("\\.zip$", "", my.zip)
  zip.path <- file.path(base.dir, my.zip)
  extracted <- file.path(tempdir(), my.file)

  # unzip extracts into the current directory, so work from tempdir() and make
  # sure the caller's directory is restored and the extracted file removed
  # even if unzip or fread fails.
  old.wd <- setwd(tempdir())
  on.exit(setwd(old.wd), add = TRUE)
  on.exit(unlink(extracted), add = TRUE)

  system(paste("unzip", shQuote(zip.path)))
  if (!file.exists(extracted)) {
    stop("Unzipping ", zip.path, " did not produce ", extracted, call. = FALSE)
  }

  # The 2014-2015 and Fall files have no trailing "TestFormat" column.
  if (year == "2014-2015" || Fall == TRUE) {
    tmp.var.names <- head(all.var.names, -1)
  } else {
    tmp.var.names <- all.var.names
  }

  # skip=1L drops the first line of the file; names are applied from the
  # layout instead. One "character" class per expected column.
  TMP <- fread(extracted, sep=',', header=FALSE, skip=1L,
               colClasses=rep("character", length(tmp.var.names)))
  setnames(TMP, tmp.var.names)
  TMP
}
#### Spring 2016
# Read the file matching "D201607" for each consortium member, in the order
# listed, and stack the results into one table.
PARCC_Data_LONG_2016 <- rbindlist(lapply(
  c("BI", "CO", "IL", "MD", "MA", "NJ", "NM", "RI", "DC"),
  function(member) read.parcc(member, "D201607")
))

# Sort/key by student, test and period.
setkeyv(PARCC_Data_LONG_2016, c("PARCCStudentIdentifier", "TestCode", "Period"))

# Keep only the columns named in parcc.var.names.
PARCC_Data_LONG_2016 <- PARCC_Data_LONG_2016[, parcc.var.names, with=FALSE]
###
### Data Cleaning - Create Required SGP Variables
###

#### ID
setnames(PARCC_Data_LONG_2016, "PARCCStudentIdentifier", "ID")

#### CONTENT_AREA from TestCode
# The labels are paired with the sorted distinct TestCode values purely by
# position. Assigning more labels than there are levels is accepted silently
# by `levels<-`, so a TestCode missing from the data would shift every later
# label onto the wrong test. Fail loudly unless the counts agree.
PARCC_Data_LONG_2016[, CONTENT_AREA := {
  content.area.labels <- c(
    "ALGEBRA_I", "ALGEBRA_II", rep("ELA", 9), "GEOMETRY", rep("MATHEMATICS", 6),
    "INTEGRATED_MATH_1", "INTEGRATED_MATH_2", "INTEGRATED_MATH_3")
  test.code <- factor(TestCode)
  if (nlevels(test.code) != length(content.area.labels)) {
    stop("Found ", nlevels(test.code), " distinct TestCode values but ",
         length(content.area.labels), " CONTENT_AREA labels; positional ",
         "relabelling would mislabel records.", call. = FALSE)
  }
  levels(test.code) <- content.area.labels
  as.character(test.code)
}]

#### GRADE from TestCode
# Strip the ELA/MAT prefix. What remains is converted through numeric (which
# drops leading zeros); anything that is not a number becomes NA (as.numeric
# warns about this) and is then labelled "EOCT".
PARCC_Data_LONG_2016[, GRADE := as.character(as.numeric(gsub("ELA|MAT", "", TestCode)))]
PARCC_Data_LONG_2016[is.na(GRADE), GRADE := "EOCT"]

#### YEAR
# AssessmentYear with "-" replaced by "_", plus ".1" for FallBlock records and
# ".2" for Spring records. Any other Period value keeps the bare year.
PARCC_Data_LONG_2016[, YEAR := gsub("-", "_", AssessmentYear)]
PARCC_Data_LONG_2016[which(Period == "FallBlock"), YEAR := paste0(YEAR, ".1")]
PARCC_Data_LONG_2016[which(Period == "Spring"), YEAR := paste0(YEAR, ".2")]

#### Valid Cases
PARCC_Data_LONG_2016[, VALID_CASE := "VALID_CASE"]

#### Invalidate Cases with missing IDs - 0 invalid in FINAL data
# NOTE(review): only NA IDs are flagged. ID is read as character, so confirm
# that blank IDs cannot arrive as "" instead of NA.
PARCC_Data_LONG_2016[which(is.na(ID)), VALID_CASE := "INVALID_CASE"]
####
#### Establish separate Theta and Scale Score long data sets
####
# Scale Score set: an independent copy (so the by-reference edits below do not
# touch PARCC_Data_LONG_2016) without the theta and filler columns, with
# "_SS" appended to CONTENT_AREA and the summative score/CSEM columns renamed
# to SCALE_SCORE / SCALE_SCORE_CSEM.
PARCC_Data_LONG_SS <- copy(PARCC_Data_LONG_2016)
PARCC_Data_LONG_SS[, c("IRTTheta", "Filler") := NULL]
PARCC_Data_LONG_SS[, CONTENT_AREA := paste(CONTENT_AREA, "SS", sep="_")]
setnames(PARCC_Data_LONG_SS, c("SummativeScaleScore", "SummativeCSEM"), c("SCALE_SCORE", "SCALE_SCORE_CSEM"))
#### Theta data set - create IRT CSEM First
# Scaling constants are looked up per CONTENT_AREA and GRADE.
# NOTE(review): the CSV presumably supplies the `a` and `b` columns used and
# dropped below - confirm against the file.
scaling.constants <- as.data.table(
read.csv("./PARCC/Data/Base_Files/2015-2016 PARCC Scaling Constants.csv"))
# Key both tables on the join columns; this also re-sorts PARCC_Data_LONG_2016.
setkey(scaling.constants, CONTENT_AREA, GRADE)
setkey(PARCC_Data_LONG_2016, CONTENT_AREA, GRADE)
# Keyed join that attaches the scaling-constant columns to the student records.
PARCC_Data_LONG_2016 <- scaling.constants[PARCC_Data_LONG_2016]
# The theta-metric CSEM is the summative CSEM divided by `a`.
PARCC_Data_LONG_2016[, SCALE_SCORE_CSEM := (as.numeric(SummativeCSEM))/a] # NO -b here... (`b` is not subtracted for the CSEM)
PARCC_Data_LONG_2016[, c("a", "b", "Filler") := NULL]
# In the theta set IRTTheta becomes SCALE_SCORE; the reported scale score and
# its CSEM are kept under the *_ACTUAL names.
setnames(PARCC_Data_LONG_2016,
c("IRTTheta", "SummativeScaleScore", "SummativeCSEM"),
c("SCALE_SCORE", "SCALE_SCORE_ACTUAL", "SCALE_SCORE_CSEM_ACTUAL"))
#### Stack Theta and SS Data
# fill=TRUE because the SS set has no *_ACTUAL columns; they are NA for its rows.
PARCC_Data_LONG_2016 <- rbindlist(list(PARCC_Data_LONG_2016, PARCC_Data_LONG_SS), fill=TRUE)
# Force CONTENT_AREA to character regardless of the type left by the join.
PARCC_Data_LONG_2016[, CONTENT_AREA := as.character(CONTENT_AREA)]
#### Prepare 2016 LONG Data for saving
# library() stops immediately if RSQLite is missing; require() would only
# return FALSE and let the script fail later.
library(RSQLite)
parcc.db <- "./PARCC/Data/PARCC_Data_LONG.sqlite"

# Read the column order of the existing 2015 table. The connection is closed
# as soon as the field list has been read, even if the lookup fails.
parcc.con <- dbConnect(SQLite(), dbname = parcc.db)
db.order <- tryCatch(
  dbListFields(parcc.con, name = "PARCC_Data_LONG_2015_2"),
  finally = dbDisconnect(parcc.con))
setcolorder(PARCC_Data_LONG_2016, db.order) # MUST Match 2015 order

# Fix column types: GRADE as character, all score / CSEM columns as numeric.
PARCC_Data_LONG_2016[, GRADE := as.character(GRADE)]
PARCC_Data_LONG_2016[, SCALE_SCORE := as.numeric(SCALE_SCORE)]
PARCC_Data_LONG_2016[, SCALE_SCORE_CSEM := as.numeric(SCALE_SCORE_CSEM)]
PARCC_Data_LONG_2016[, SCALE_SCORE_ACTUAL := as.numeric(SCALE_SCORE_ACTUAL)]
PARCC_Data_LONG_2016[, SCALE_SCORE_CSEM_ACTUAL := as.numeric(SCALE_SCORE_CSEM_ACTUAL)]

#### Save 2016 LONG Data
save(PARCC_Data_LONG_2016, file = "./PARCC/Data/PARCC_Data_LONG_2016.Rdata")

##### Add 2016 LONG data to existing SQLite Database. Tables stored by each year / period
# overwrite=TRUE replaces any existing "PARCC_Data_LONG_2016_2" table. The
# connection is closed whether or not the write succeeds.
parcc.con <- dbConnect(SQLite(), dbname = parcc.db)
tryCatch(
  dbWriteTable(parcc.con,
    name = "PARCC_Data_LONG_2016_2", value=PARCC_Data_LONG_2016, overwrite=TRUE),
  finally = dbDisconnect(parcc.con))
SGP - Student Growth Percentiles SGP Blog | SGP GitHub Repo | SGP on CRAN