Database schemas

Enhances a relation_schema object with foreign key reference information.

Usage

database_schema(relation_schemas, references)

Arguments

relation_schemas: a relation_schema object, as returned by synthesise or relation_schema.
references: a list of references, each represented by a list containing four character elements. In order, the elements are a scalar giving the name of the child (referrer) schema, a vector giving the child attribute names, a scalar giving the name of the parent (referee) schema, and a vector giving the parent attribute names. The vectors must be of the same length and contain names for attributes present in their respective schemas, and the parent attributes must form a key.

Value

A database_schema object, containing relation_schemas with references stored in an attribute of the same name. References are stored with their attributes in the order they appear in their respective relation schemas.

Details

Unlike functional_dependency and relation_schema, database_schema is not designed to be vector-like: it only holds a single database schema. This adheres to the usual package use case, where a single data frame is being analysed at a time. However, it inherits from relation_schema, so is vectorised with respect to its relation schemas.

As with relation_schema, duplicate relation schemas, after ordering by attribute, are allowed, and can be removed with unique.

References, i.e. foreign key references, are allowed to have different attribute names in the child and parent relations; this can't occur in the output for autoref and normalise.

Subsetting removes any references that involve removed relation schemas. Removing duplicates with unique changes references involving duplicates to involve the kept equivalent schemas instead. Renaming relation schemas with names<- also changes their names in the references.

Examples

rs <- relation_schema(
  list(
    a = list(c("a", "b"), list("a")),
    b = list(c("b", "c"), list("b", "c"))
  ),
  attrs_order = c("a", "b", "c", "d")
)
ds <- database_schema(
  rs,
  list(list("a", "b", "b", "b"))
)
print(ds)
#> database schema with 2 relation schemas
#> 4 attributes: a, b, c, d
#> schema a: a, b
#>   key 1: a
#> schema b: b, c
#>   key 1: b
#>   key 2: c
#> references:
#> a.{b} -> b.{b}
attrs(ds)
#> $a
#> [1] "a" "b"
#> 
#> $b
#> [1] "b" "c"
#> 
keys(ds)
#> $a
#> $a[[1]]
#> [1] "a"
#> 
#> 
#> $b
#> $b[[1]]
#> [1] "b"
#> 
#> $b[[2]]
#> [1] "c"
#> 
#> 
attrs_order(ds)
#> [1] "a" "b" "c" "d"
names(ds)
#> [1] "a" "b"
references(ds)
#> [[1]]
#> [[1]][[1]]
#> [1] "a"
#> 
#> [[1]][[2]]
#> [1] "b"
#> 
#> [[1]][[3]]
#> [1] "b"
#> 
#> [[1]][[4]]
#> [1] "b"
#> 
#> 

# relations can't reference themselves
if (FALSE) { # \dontrun{
  database_schema(
    relation_schema(
      list(a = list("a", list("a"))),
      c("a", "b")
    ),
    list(list("a", "a", "a", "a"))
  )
  database_schema(
    relation_schema(
      list(a = list(c("a", "b"), list("a"))),
      c("a", "b")
    ),
    list(list("a", "b", "a", "a"))
  )
} # }

# an example with references between differently-named attributes
print(database_schema(
  relation_schema(
    list(
      citation = list(c("citer", "citee"), list(c("citer", "citee"))),
      article = list("article", list("article"))
    ),
    c("citer", "citee", "article")
  ),
  list(
    list("citation", "citer", "article", "article"),
    list("citation", "citee", "article", "article")
  )
))
#> database schema with 2 relation schemas
#> 3 attributes: citer, citee, article
#> schema citation: citer, citee
#>   key 1: citer, citee
#> schema article: article
#>   key 1: article
#> references:
#> citation.{citer} -> article.{article}
#> citation.{citee} -> article.{article}

# vector operations
ds2 <- database_schema(
  relation_schema(
    list(
      e = list(c("a", "e"), list("e"))
    ),
    attrs_order = c("a", "e")
  ),
  list()
)
c(ds, ds2) # attrs_order attributes are merged
#> database schema with 3 relation schemas
#> 5 attributes: a, b, c, d, e
#> schema a: a, b
#>   key 1: a
#> schema b: b, c
#>   key 1: b
#>   key 2: c
#> schema e: e, a
#>   key 1: e
#> references:
#> a.{b} -> b.{b}
unique(c(ds, ds))
#> database schema with 2 relation schemas
#> 4 attributes: a, b, c, d
#> schema a: a, b
#>   key 1: a
#> schema b: b, c
#>   key 1: b
#>   key 2: c
#> references:
#> a.{b} -> b.{b}

# subsetting
ds[1]
#> database schema with 1 relation schema
#> 4 attributes: a, b, c, d
#> schema a: a, b
#>   key 1: a
#> no references
stopifnot(identical(ds[[1]], ds[1]))
ds[c(1, 2, 1, 2)] # replicates the foreign key references
#> database schema with 4 relation schemas
#> 4 attributes: a, b, c, d
#> schema a: a, b
#>   key 1: a
#> schema b: b, c
#>   key 1: b
#>   key 2: c
#> schema a.1: a, b
#>   key 1: a
#> schema b.1: b, c
#>   key 1: b
#>   key 2: c
#> references:
#> a.{b} -> b.{b}
#> a.{b} -> b.1.{b}
#> a.1.{b} -> b.{b}
#> a.1.{b} -> b.1.{b}
c(ds[c(1, 2)], ds[c(1, 2)]) # doesn't reference between separate copies of ds
#> database schema with 4 relation schemas
#> 4 attributes: a, b, c, d
#> schema a: a, b
#>   key 1: a
#> schema b: b, c
#>   key 1: b
#>   key 2: c
#> schema a.1: a, b
#>   key 1: a
#> schema b.1: b, c
#>   key 1: b
#>   key 2: c
#> references:
#> a.{b} -> b.{b}
#> a.1.{b} -> b.1.{b}
unique(ds[c(1, 2, 1, 2)]) # unique() also merges references
#> database schema with 2 relation schemas
#> 4 attributes: a, b, c, d
#> schema a: a, b
#>   key 1: a
#> schema b: b, c
#>   key 1: b
#>   key 2: c
#> references:
#> a.{b} -> b.{b}

# another example of unique() merging references
ds_merge <- database_schema(
  relation_schema(
    list(
      a = list(c("a", "b"), list("a")),
      b = list(c("b", "c", "d"), list("b")),
      c_d = list(c("c", "d", "e"), list(c("c", "d"))),
      a.1 = list(c("a", "b"), list("a")),
      b.1 = list(c("b", "c", "d"), list("b"))
    ),
    c("a", "b", "c", "d", "e")
  ),
  list(
    list("a", "b", "b", "b"),
    list("b.1", c("c", "d"), "c_d", c("c", "d"))
  )
)
print(ds_merge)
#> database schema with 5 relation schemas
#> 5 attributes: a, b, c, d, e
#> schema a: a, b
#>   key 1: a
#> schema b: b, c, d
#>   key 1: b
#> schema c_d: c, d, e
#>   key 1: c, d
#> schema a.1: a, b
#>   key 1: a
#> schema b.1: b, c, d
#>   key 1: b
#> references:
#> a.{b} -> b.{b}
#> b.1.{c, d} -> c_d.{c, d}
unique(ds_merge)
#> database schema with 3 relation schemas
#> 5 attributes: a, b, c, d, e
#> schema a: a, b
#>   key 1: a
#> schema b: b, c, d
#>   key 1: b
#> schema c_d: c, d, e
#>   key 1: c, d
#> references:
#> a.{b} -> b.{b}
#> b.{c, d} -> c_d.{c, d}

# reassignment
# can't change keys included in references
if (FALSE) keys(ds)[[2]] <- list("c") # \dontrun{}
# can't remove attributes included in keys
if (FALSE) attrs(ds)[[2]] <- list("c", "d") # \dontrun{}
# can't remove attributes included in references
if (FALSE) attrs(ds)[[1]] <- c("a", "d") # \dontrun{}
ds3 <- ds
# can change subset of schema, but loses references between altered and
# non-altered subsets
ds3[2] <- database_schema(
  relation_schema(
    list(d = list(c("d", "c"), list("d"))),
    attrs_order(ds3)
  ),
  list()
)
print(ds3) # note the schema's name doesn't change
#> database schema with 2 relation schemas
#> 4 attributes: a, b, c, d
#> schema a: a, b
#>   key 1: a
#> schema b: d, c
#>   key 1: d
#> no references
# names(ds3)[2] <- "d" # this would change the name
keys(ds3)[[2]] <- list(character()) # removing keys first...
attrs(ds3)[[2]] <- c("b", "c") # so we can change the attrs legally
keys(ds3)[[2]] <- list("b", "c") # add the new keys
# add the reference lost during subset replacement
references(ds3) <- c(references(ds3), list(list("a", "b", "b", "b")))
stopifnot(identical(ds3, ds))

# changing appearance priority for attributes
attrs_order(ds3) <- c("d", "c", "b", "a")
print(ds3)
#> database schema with 2 relation schemas
#> 4 attributes: d, c, b, a
#> schema a: a, b
#>   key 1: a
#> schema b: c, b
#>   key 1: c
#>   key 2: b
#> references:
#> a.{b} -> b.{b}

# changing relation schema names changes them in references
names(ds3) <- paste0(names(ds3), "_long")
print(ds3)
#> database schema with 2 relation schemas
#> 4 attributes: d, c, b, a
#> schema a_long: a, b
#>   key 1: a
#> schema b_long: c, b
#>   key 1: c
#>   key 2: b
#> references:
#> a_long.{b} -> b_long.{b}

# reconstructing from components
ds_recon <- database_schema(
  relation_schema(
    Map(list, attrs(ds), keys(ds)),
    attrs_order(ds)
  ),
  references(ds)
)
stopifnot(identical(ds_recon, ds))
ds_recon2 <- database_schema(
  subschemas(ds),
  references(ds)
)
stopifnot(identical(ds_recon2, ds))

# can be a data frame column
data.frame(id = 1:2, schema = ds)
#>   id   schema
#> 1  1 schema a
#> 2  2 schema b

Usage

Arguments

Value

Details

See also

Examples