Title: | Blocking for Record Linkage |
---|---|
Description: | An implementation of the blocking algorithm KLSH in Steorts, Ventura, Sadinle, Fienberg (2014) <DOI:10.1007/978-3-319-11257-2_20>, which is a k-means variant of locality sensitive hashing. The method is illustrated with examples and a vignette. |
Authors: | Rebecca Steorts [aut, cre] |
Maintainer: | Rebecca Steorts <[email protected]> |
License: | GPL-3 |
Version: | 0.1.0 |
Built: | 2025-02-02 03:11:16 UTC |
Source: | https://github.com/cleanzr/klsh |
Function to convert a record into a bag of tokens with a fieldwise flag
bag_of_word_ify(record, k, fieldwise = FALSE)
bag_of_word_ify(record, k, fieldwise = FALSE)
record |
String or record |
k |
Parameter k, which is the number of shingles, tokens, or grams to break the string into |
fieldwise |
Flag where the default setting (FALSE) is to include the record as the entire string |
Computes the bag of tokens for a string
data(RLdata500) data.500 <- RLdata500[-c(2,4)] bag_of_word_ify(data.500[1,c(-2)],k=2) bag_of_word_ify(data.500[300,c(-2)],k=2) names(bag_of_word_ify(data.500[300,c(-2)],k=2))
data(RLdata500) data.500 <- RLdata500[-c(2,4)] bag_of_word_ify(data.500[1,c(-2)],k=2) bag_of_word_ify(data.500[300,c(-2)],k=2) names(bag_of_word_ify(data.500[300,c(-2)],k=2))
Function that reduces a bag of words into a signature matrix using multiple random projections
bag_signatures(sack_of_bags, p, weighting_table)
bag_signatures(sack_of_bags, p, weighting_table)
sack_of_bags |
Sack of bag of words |
p |
Number of random projections p |
weighting_table |
Weighting table (inverse document frequency) |
Computes a signature matrix using multiple random projections and the inverse document frequency weights
data(RLdata500) data.500 <- RLdata500[-c(2,4)] sack <- sacks_of_bags_of_words(data.500[1:3,c(-2)],k=2) idf <- calc_idf(sack) bag_signatures(sack, p=5, idf)
data(RLdata500) data.500 <- RLdata500[-c(2,4)] sack <- sacks_of_bags_of_words(data.500[1:3,c(-2)],k=2) idf <- calc_idf(sack) bag_signatures(sack, p=5, idf)
Returns the block ids associated with a blocking method.
block.ids.from.blocking(blocking)
block.ids.from.blocking(blocking)
blocking |
A list of the blocks. |
A list of the block ids that correspond to each block
data("RLdata500") klsh.blocks <- klsh(RLdata500, p=20, num.blocks=5, k=2) block.ids.from.blocking(klsh.blocks)
data("RLdata500") klsh.blocks <- klsh(RLdata500, p=20, num.blocks=5, k=2) block.ids.from.blocking(klsh.blocks)
Function to calculate the inverse document frequency given a shingled bag of words
calc_idf(sack_of_bags)
calc_idf(sack_of_bags)
sack_of_bags |
Sack of bag of words |
Computes the inverse document frequency for a bag of words
data(RLdata500) data.500 <- RLdata500[-c(2,4)] sack <- sacks_of_bags_of_words(data.500[1:3,c(-2)],k=2) (idf <- calc_idf(sack)) match(names(sack[[1]]), names(idf))
data(RLdata500) data.500 <- RLdata500[-c(2,4)] sack <- sacks_of_bags_of_words(data.500[1:3,c(-2)],k=2) (idf <- calc_idf(sack)) match(names(sack[[1]]), names(idf))
Perform evaluations (recall) for blocking.
confusion.from.blocking(blocking, true_ids, recall.only = FALSE)
confusion.from.blocking(blocking, true_ids, recall.only = FALSE)
blocking |
A list of the blocks |
true_ids |
The true identifiers for comparisons |
recall.only |
Flag that when true only prints the recall, otherwise prints many evaluation metrics in a list |
A vector containing the recall and the precision
data("RLdata500") klsh.blocks <- klsh(RLdata500, p=20, num.blocks=5, k=2) confusion.from.blocking(klsh.blocks, identity.RLdata500) confusion.from.blocking(klsh.blocks, identity.RLdata500, recall.only=TRUE)
data("RLdata500") klsh.blocks <- klsh(RLdata500, p=20, num.blocks=5, k=2) confusion.from.blocking(klsh.blocks, identity.RLdata500) confusion.from.blocking(klsh.blocks, identity.RLdata500, recall.only=TRUE)
Function that blocks a set of records using KLSH, a k-means variant of locality sensitive hashing in which bags of words are reduced to signatures using multiple random projections
klsh(r.set, p, num.blocks, k, fieldwise = FALSE, quiet = TRUE)
klsh(r.set, p, num.blocks, k, fieldwise = FALSE, quiet = TRUE)
r.set |
Set of records |
p |
Number of random projections p |
num.blocks |
The total number of desired blocks |
k |
The total number of tokens |
fieldwise |
Flag with default FALSE |
quiet |
Flag controlling whether progress is printed; defaults to TRUE |
The blocks from performing KLSH
data(RLdata500) data.500 <- RLdata500[-c(2,4)] klsh.blocks <- klsh(data.500, p=20, num.blocks=5, k=2)
data(RLdata500) data.500 <- RLdata500[-c(2,4)] klsh.blocks <- klsh(data.500, p=20, num.blocks=5, k=2)
Returns the reduction ratio associated with a blocking method
reduction.ratio(block.labels)
reduction.ratio(block.labels)
block.labels |
A list of the block labels. |
The reduction ratio
data("RLdata500") klsh.blocks <- klsh(RLdata500, p=20, num.blocks=5, k=2) block.ids <- block.ids.from.blocking(klsh.blocks) reduction.ratio(block.ids)
data("RLdata500") klsh.blocks <- klsh(RLdata500, p=20, num.blocks=5, k=2) block.ids <- block.ids.from.blocking(klsh.blocks) reduction.ratio(block.ids)
Returns the reduction ratio associated with a blocking method
reduction.ratio.from.blocking(blocking)
reduction.ratio.from.blocking(blocking)
blocking |
The actual blocks |
The reduction ratio
data("RLdata500") klsh.blocks <- klsh(RLdata500, p=20, num.blocks=5, k=2) reduction.ratio.from.blocking(klsh.blocks)
data("RLdata500") klsh.blocks <- klsh(RLdata500, p=20, num.blocks=5, k=2) reduction.ratio.from.blocking(klsh.blocks)
Function that generates unit random vectors and takes (weighted) projections onto the random unit vectors given a bag of words
rproject_bags(sack_of_bags, weighting_table)
rproject_bags(sack_of_bags, weighting_table)
sack_of_bags |
Sack of bag of words |
weighting_table |
Weighting table (inverse document frequency) |
Computes the (weighted) projections of the bags of words onto random unit vectors
data(RLdata500) data.500 <- RLdata500[-c(2,4)] sack <- sacks_of_bags_of_words(data.500[1:3,c(-2)],k=2) idf <- calc_idf(sack) match(names(sack[[1]]), names(idf)) rproject_bags(sack, idf)
data(RLdata500) data.500 <- RLdata500[-c(2,4)] sack <- sacks_of_bags_of_words(data.500[1:3,c(-2)],k=2) idf <- calc_idf(sack) match(names(sack[[1]]), names(idf)) rproject_bags(sack, idf)
Function to convert all records into a bag of tokens
sacks_of_bags_of_words(r.set, k, fieldwise = FALSE)
sacks_of_bags_of_words(r.set, k, fieldwise = FALSE)
r.set |
Record set |
k |
Parameter k, which is the number of shingles, tokens, or grams to break the string into |
fieldwise |
Flag where the default setting (FALSE) is to include the record as the entire string |
Computes the bag of tokens for a record set
data(RLdata500) data.500 <- RLdata500[-c(2,4)] sacks_of_bags_of_words(data.500[1:3,c(-2)],k=2)
data(RLdata500) data.500 <- RLdata500[-c(2,4)] sacks_of_bags_of_words(data.500[1:3,c(-2)],k=2)
Function to tokenize a string into its k components
tokenify(string, k)
tokenify(string, k)
string |
A string or record |
k |
A parameter k, which is the number of shingles, tokens, or grams to break the string into |
Computes the tokenized or grammed version of a string
tokenify("Alexander",2) tokenify("Alexander Smith", 2)
tokenify("Alexander",2) tokenify("Alexander Smith", 2)