The purpose of diffdf
is to provide proc compare
like functionality to R for use
in second line programming. In particular we focus on raising warnings
if any differences are found whilst providing in-depth diagnostics to
highlight where these differences have occurred.
Here we show the basic functionality of diffdf
using a
dummy data set.
library(diffdf)
# Number of rows in the dummy dataset.
LENGTH <- 30
# Select the R 3.5.0 random number generator settings (silencing any warning
# the call emits) and fix the seed so the generated data is reproducible.
suppressWarnings(RNGversion("3.5.0"))
set.seed(12334)
# Build a dataset with one column per common data type. The random draws are
# made in the order the columns are listed, so reordering the columns would
# change the generated values.
test_data <- tibble::tibble(
ID = 1:LENGTH,
GROUP1 = rep(c(1, 2), each = LENGTH / 2),
GROUP2 = rep(c(1:(LENGTH / 2)), 2),
INTEGER = rpois(LENGTH, 40),
BINARY = sample(c("M", "F"), LENGTH, replace = TRUE),
DATE = lubridate::ymd("2000-01-01") + rnorm(LENGTH, 0, 7000),
DATETIME = lubridate::ymd_hms("2000-01-01 00:00:00") + rnorm(LENGTH, 0, 200000000),
CONTINUOUS = rnorm(LENGTH, 30, 12),
CATEGORICAL = factor(sample(c("A", "B", "C"), LENGTH, replace = TRUE)),
LOGICAL = sample(c(TRUE, FALSE), LENGTH, replace = TRUE),
CHARACTER = stringi::stri_rand_strings(LENGTH, rpois(LENGTH, 13), pattern = "[ A-Za-z0-9]")
)
# Print the dataset.
test_data
#> # A tibble: 30 × 11
#> ID GROUP1 GROUP2 INTEGER BINARY DATE DATETIME CONTINUOUS
#> <int> <dbl> <int> <int> <chr> <date> <dttm> <dbl>
#> 1 1 1 1 41 M 2003-06-22 2000-11-28 20:40:53 21.6
#> 2 2 1 2 41 F 2016-12-03 1994-08-30 19:05:02 26.5
#> 3 3 1 3 41 M 2016-05-08 1992-09-11 11:30:18 16.1
#> 4 4 1 4 32 M 2015-06-02 2007-11-12 11:28:29 23.5
#> 5 5 1 5 55 F 1986-04-15 1998-08-04 01:27:49 21.2
#> 6 6 1 6 33 M 1994-05-25 2001-12-05 08:24:35 46.9
#> 7 7 1 7 40 F 2009-02-08 1986-11-02 18:13:03 28.2
#> 8 8 1 8 44 F 2020-07-21 1998-08-22 05:23:24 27.7
#> 9 9 1 9 51 F 1967-05-25 2003-01-03 22:09:29 22.0
#> 10 10 1 10 40 M 2044-03-11 1996-04-19 11:10:12 40.9
#> # ℹ 20 more rows
#> # ℹ 3 more variables: CATEGORICAL <fct>, LOGICAL <lgl>, CHARACTER <chr>
# Compare the dataset against itself.
diffdf(test_data, test_data)
#> No issues were found!
As you would expect no differences are found. We now look to
introduce various types of differences into the data in order to show how
diffdf
highlights them. Note that for the purposes of this
vignette we have used the suppress_warnings
argument to
stop warnings being raised; it is recommended however that this option is
not used in production code as it may mask problems.
# Drop the DATE column (the sixth column) from the comparison dataset.
test_data2 <- test_data[, names(test_data) != "DATE"]
diffdf(test_data, test_data2, suppress_warnings = TRUE)
#> Differences found between the objects!
#>
#> Summary of BASE and COMPARE
#> ==================================================================
#> PROPERTY BASE COMP
#> ------------------------------------------------------------------
#> Name test_data test_data2
#> Class "tbl_df, tbl, data.frame" "tbl_df, tbl, data.frame"
#> Rows(#) 30 30
#> Columns(#) 11 10
#> ------------------------------------------------------------------
#>
#>
#> There are columns in BASE that are not in COMPARE !!
#> =========
#> COLUMNS
#> ---------
#> DATE
#> ---------
# Remove the final two rows from the comparison dataset.
rows_to_keep <- seq_len(nrow(test_data) - 2)
test_data2 <- test_data[rows_to_keep, ]
diffdf(test_data, test_data2, suppress_warnings = TRUE)
#> Differences found between the objects!
#>
#> Summary of BASE and COMPARE
#> ==================================================================
#> PROPERTY BASE COMP
#> ------------------------------------------------------------------
#> Name test_data test_data2
#> Class "tbl_df, tbl, data.frame" "tbl_df, tbl, data.frame"
#> Rows(#) 30 28
#> Columns(#) 11 11
#> ------------------------------------------------------------------
#>
#>
#> There are rows in BASE that are not in COMPARE !!
#> ===============
#> ..ROWNUMBER..
#> ---------------
#> 29
#> 30
#> ---------------
# Change a single GROUP1 value (row 5) in the comparison dataset.
test_data2 <- test_data
test_data2$GROUP1[5] <- 6
diffdf(test_data, test_data2, suppress_warnings = TRUE)
#> Differences found between the objects!
#>
#> Summary of BASE and COMPARE
#> ==================================================================
#> PROPERTY BASE COMP
#> ------------------------------------------------------------------
#> Name test_data test_data2
#> Class "tbl_df, tbl, data.frame" "tbl_df, tbl, data.frame"
#> Rows(#) 30 30
#> Columns(#) 11 11
#> ------------------------------------------------------------------
#>
#>
#> Not all Values Compared Equal
#> =============================
#> Variable No of Differences
#> -----------------------------
#> GROUP1 1
#> -----------------------------
#>
#>
#> ========================================
#> VARIABLE ..ROWNUMBER.. BASE COMPARE
#> ----------------------------------------
#> GROUP1 5 1 6
#> ----------------------------------------
# Convert GROUP1 (the second column) to character in the comparison dataset.
# The column is extracted as a vector with `[[`: `[, 2]` on a tibble returns a
# one-column tibble, and as.character() on that deparses the whole column into
# a single string rather than converting each value.
test_data2 <- test_data
test_data2[[2]] <- as.character(test_data2[[2]])
diffdf(test_data, test_data2, suppress_warnings = TRUE)
#> Differences found between the objects!
#>
#> Summary of BASE and COMPARE
#> ==================================================================
#> PROPERTY BASE COMP
#> ------------------------------------------------------------------
#> Name test_data test_data2
#> Class "tbl_df, tbl, data.frame" "tbl_df, tbl, data.frame"
#> Rows(#) 30 30
#> Columns(#) 11 11
#> ------------------------------------------------------------------
#>
#>
#> There are columns in BASE and COMPARE with different modes !!
#> ================================
#> VARIABLE MODE.BASE MODE.COMP
#> --------------------------------
#> GROUP1 numeric character
#> --------------------------------
#>
#>
#> There are columns in BASE and COMPARE with different classes !!
#> ==================================
#> VARIABLE CLASS.BASE CLASS.COMP
#> ----------------------------------
#> GROUP1 numeric character
#> ----------------------------------
# Give ID a different "label" attribute in each dataset. Note that the base
# dataset test_data is modified here as well, so later copies of it carry
# the label too.
test_data2 <- test_data
test_data$ID <- structure(test_data$ID, label = "This is a interesting label")
test_data2$ID <- structure(test_data2$ID, label = "what do I type here?")
diffdf(test_data, test_data2, suppress_warnings = TRUE)
#> Differences found between the objects!
#>
#> Summary of BASE and COMPARE
#> ==================================================================
#> PROPERTY BASE COMP
#> ------------------------------------------------------------------
#> Name test_data test_data2
#> Class "tbl_df, tbl, data.frame" "tbl_df, tbl, data.frame"
#> Rows(#) 30 30
#> Columns(#) 11 11
#> ------------------------------------------------------------------
#>
#>
#> There are columns in BASE and COMPARE with differing attributes !!
#> ============================================================================
#> VARIABLE ATTR_NAME VALUES.BASE VALUES.COMP
#> ----------------------------------------------------------------------------
#> ID label "This is a interesting label" "what do I type here?"
#> ----------------------------------------------------------------------------
# Rename the factor levels of CATEGORICAL to "1", "2", "3" in the comparison
# dataset.
test_data2 <- test_data
levels(test_data2$CATEGORICAL) <- as.character(1:3)
diffdf(test_data, test_data2, suppress_warnings = TRUE)
#> Differences found between the objects!
#>
#> Summary of BASE and COMPARE
#> ==================================================================
#> PROPERTY BASE COMP
#> ------------------------------------------------------------------
#> Name test_data test_data2
#> Class "tbl_df, tbl, data.frame" "tbl_df, tbl, data.frame"
#> Rows(#) 30 30
#> Columns(#) 11 11
#> ------------------------------------------------------------------
#>
#>
#> There are columns in BASE and COMPARE with differing attributes !!
#> ============================================================
#> VARIABLE ATTR_NAME VALUES.BASE VALUES.COMP
#> ------------------------------------------------------------
#> CATEGORICAL levels c("A", "B", "C") c("1", "2", "3")
#> ------------------------------------------------------------
#>
#>
#> Not all Values Compared Equal
#> ================================
#> Variable No of Differences
#> --------------------------------
#> CATEGORICAL 30
#> --------------------------------
#>
#>
#> First 10 of 30 rows are shown in table below
#> ===========================================
#> VARIABLE ..ROWNUMBER.. BASE COMPARE
#> -------------------------------------------
#> CATEGORICAL 1 C 3
#> CATEGORICAL 2 C 3
#> CATEGORICAL 3 A 1
#> CATEGORICAL 4 C 3
#> CATEGORICAL 5 A 1
#> CATEGORICAL 6 A 1
#> CATEGORICAL 7 A 1
#> CATEGORICAL 8 A 1
#> CATEGORICAL 9 C 3
#> CATEGORICAL 10 B 2
#> -------------------------------------------
A key feature of diffdf
that enables easier diagnostics
is the ability to specify which variables form a unique row i.e. which
rows should be compared against each other based upon a key. By default
if no key is specified diffdf
will use the row numbers as
the key; however, in general this isn’t recommended as it means two
identical datasets simply sorted differently can lead to
incomprehensible error messages as every observation is flagged as
different. In diffdf
keys can be specified as character
vectors using the keys
argument.
# Alter INTEGER in three rows, then compare using GROUP1 and GROUP2 as the key
# so that differences are reported by key value rather than by row number.
test_data2 <- test_data
altered_rows <- c(2, 5, 15)
test_data2$INTEGER[altered_rows] <- 99L
diffdf(
  test_data,
  test_data2,
  keys = c("GROUP1", "GROUP2"),
  suppress_warnings = TRUE
)
#> Differences found between the objects!
#>
#> Summary of BASE and COMPARE
#> ==================================================================
#> PROPERTY BASE COMP
#> ------------------------------------------------------------------
#> Name test_data test_data2
#> Class "tbl_df, tbl, data.frame" "tbl_df, tbl, data.frame"
#> Rows(#) 30 30
#> Columns(#) 11 11
#> ------------------------------------------------------------------
#>
#>
#> Not all Values Compared Equal
#> =============================
#> Variable No of Differences
#> -----------------------------
#> INTEGER 3
#> -----------------------------
#>
#>
#> =========================================
#> VARIABLE GROUP1 GROUP2 BASE COMPARE
#> -----------------------------------------
#> INTEGER 1 2 41 99
#> INTEGER 1 5 55 99
#> INTEGER 1 15 44 99
#> -----------------------------------------
As an additional utility diffdf
comes with the function
diffdf_issuerows()
which can be used to subset your dataset
against the issue object to return just the rows that are flagged as
containing issues.
# Overwrite one value in each of the first three rows of a copy of iris
# (row i, column i), then compare the copy against the original.
iris2 <- iris
for (i in seq_len(3)) {
  iris2[i, i] <- 99
}
diff <- diffdf(iris, iris2, suppress_warnings = TRUE)
# Rows of the base dataset that the comparison flagged.
diffdf_issuerows(iris, diff)
#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1 5.1 3.5 1.4 0.2 setosa
#> 2 4.9 3.0 1.4 0.2 setosa
#> 3 4.7 3.2 1.3 0.2 setosa
# The same flagged rows, this time taken from the comparison dataset.
diffdf_issuerows(iris2, diff)
#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1 99.0 3.5 1.4 0.2 setosa
#> 2 4.9 99.0 1.4 0.2 setosa
#> 3 4.7 3.2 99.0 0.2 setosa
Bear in mind that the vars
option can be used to just
subset down to issues associated with particular variables.
# Only rows with an issue in Sepal.Length.
diffdf_issuerows(iris2, diff, vars = "Sepal.Length")
#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1 99 3.5 1.4 0.2 setosa
# Rows with an issue in either Sepal.Length or Sepal.Width.
diffdf_issuerows(iris2, diff, vars = c("Sepal.Length", "Sepal.Width"))
#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1 99.0 3.5 1.4 0.2 setosa
#> 2 4.9 99.0 1.4 0.2 setosa
Sometimes it can be useful to use the comparison result to fuel
further checks or programming logic. To assist with this
diffdf
offers two pieces of functionality namely the
suppress_warnings
argument (which has already been shown)
and the diffdf_has_issues()
helper function which simply
returns TRUE if differences have been found else FALSE.
You can use the tolerance
argument of
diffdf
to define how sensitive the comparison should be to
decimal place inaccuracies. This is important as very often floating point
numbers will not compare equal due to machine rounding as they cannot be
perfectly represented in binary. By default tolerance is set to
sqrt(.Machine$double.eps).
# Two single-value datasets whose values differ by roughly 1e-06; with the
# default tolerance this is reported as a difference.
dsin1 <- data.frame(x = 1.1e-06)
dsin2 <- data.frame(x = 1.1e-07)
diffdf(dsin1, dsin2, suppress_warnings = TRUE)
#> Differences found between the objects!
#>
#> Summary of BASE and COMPARE
#> ====================================
#> PROPERTY BASE COMP
#> ------------------------------------
#> Name dsin1 dsin2
#> Class data.frame data.frame
#> Rows(#) 1 1
#> Columns(#) 1 1
#> ------------------------------------
#>
#>
#> Not all Values Compared Equal
#> =============================
#> Variable No of Differences
#> -----------------------------
#> x 1
#> -----------------------------
#>
#>
#> ===========================================
#> VARIABLE ..ROWNUMBER.. BASE COMPARE
#> -------------------------------------------
#> x 1 1.1e-06 1.1e-07
#> -------------------------------------------
# With a looser tolerance of 0.001 the same two values compare as equal.
diffdf(dsin1, dsin2, tolerance = 0.001, suppress_warnings = TRUE)
#> No issues were found!
By default, the function will note a difference between integer and
double columns, and factor and character columns. It can be useful in
some contexts to prevent this from occurring. We can do so with the
strict_numeric = FALSE
and
strict_factor = FALSE
arguments.
# The same values stored as integer (base) and as double (compare).
dsin1 <- data.frame(x = c(1L, 2L, 3L))
dsin2 <- data.frame(x = c(1, 2, 3))
diffdf(dsin1, dsin2, suppress_warnings = TRUE)
#> Differences found between the objects!
#>
#> Summary of BASE and COMPARE
#> ====================================
#> PROPERTY BASE COMP
#> ------------------------------------
#> Name dsin1 dsin2
#> Class data.frame data.frame
#> Rows(#) 3 3
#> Columns(#) 1 1
#> ------------------------------------
#>
#>
#> There are columns in BASE and COMPARE with different classes !!
#> ==================================
#> VARIABLE CLASS.BASE CLASS.COMP
#> ----------------------------------
#> x integer numeric
#> ----------------------------------
# strict_numeric = FALSE: the integer column is cast to numeric before the
# comparison (see the NOTE in the output), so no class difference is reported.
diffdf(dsin1, dsin2, suppress_warnings = TRUE, strict_numeric = FALSE)
#> NOTE: Variable x in base was casted to numeric
#> No issues were found!
# The same values stored as character (base) and as factor (compare).
dsin1 <- data.frame(x = c("1", "2", "3"), stringsAsFactors = FALSE)
dsin2 <- data.frame(x = factor(c(1, 2, 3)))
diffdf(dsin1, dsin2, suppress_warnings = TRUE)
#> Differences found between the objects!
#>
#> Summary of BASE and COMPARE
#> ====================================
#> PROPERTY BASE COMP
#> ------------------------------------
#> Name dsin1 dsin2
#> Class data.frame data.frame
#> Rows(#) 3 3
#> Columns(#) 1 1
#> ------------------------------------
#>
#>
#> There are columns in BASE and COMPARE with different modes !!
#> ================================
#> VARIABLE MODE.BASE MODE.COMP
#> --------------------------------
#> x character numeric
#> --------------------------------
#>
#>
#> There are columns in BASE and COMPARE with different classes !!
#> ==================================
#> VARIABLE CLASS.BASE CLASS.COMP
#> ----------------------------------
#> x character factor
#> ----------------------------------
# strict_factor = FALSE: the factor column is cast to character before the
# comparison (see the NOTE in the output), so no mode/class difference is
# reported.
diffdf(dsin1, dsin2, suppress_warnings = TRUE, strict_factor = FALSE)
#> NOTE: Variable x in compare was casted to character
#> No issues were found!