library(tidyverse)

## ── Attaching packages ────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──

## ✔ ggplot2 3.3.1     ✔ purrr   0.3.4
## ✔ tibble  3.0.6     ✔ dplyr   1.0.4
## ✔ tidyr   1.1.0     ✔ stringr 1.4.0
## ✔ readr   1.3.1     ✔ forcats 0.5.0

## ── Conflicts ───────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

Regular Expression

Introduction

A regular expression, regex or regexp is a sequence of characters that define a search pattern.

There exists many versions of regular expressions. stringr of tidyverse follows the ICU standard while base R follows the PCRE standard. See Comparison of regular-expression engines

We again prefer the tidyverse packages over base functions.

Basic concepts

We will use https://regex101.com/ to illustrate the following. (Choose the Python engine to best mimic ICU standard)

Boolean “or” - gray|grey can match gray or grey
Grouping - Parentheses are used to define the scope and precedence of the operators - gr(a|e)y can match gray and grey.
A quantifier after a token, character or group specifies how often that a preceding element is allowed to occur.

quantifier

? zero or one occurrences

* zero or more occurrences

+ one or more occurrences

{n} exactly n occurrences

{n,} n or more times

{m, n} m or more times but not more than n

Examples:
- colou?r matches both “color” and “colour”.
- ab*c matches “ac”, “abc”, “abbc”, “abbbc”, and so on
- ab+c matches “abc”, “abbc”, “abbbc”, and so on, but not “ac”
Wildcard - The wildcard . matches any character except a new line.

Examples:
- a.c matches “aac”, “abc” and so on.
Anchors - ^ matches the beginning of a string (line if multiline mode) $ matches the end of a string (line if multiline mode)

Examples:
- ^abc matches “abc” but not “cabc”
- abc$ matches “abc” but not “abcd”
Bracket expression [...] matches a single character that is contained within the brackets

Examples:
- [abc] matches “a”, “b”, or “c”
- [abc123] matches “a”, “b”, “c”, “1”, “2” or “3”
- [a-z] specifies a range which matches any lowercase letter from “a” to “z”.
- [a-zA-Z0-9] matches all alphanumerics
- [\[\]] matches [ or ]
Bracket expression [^...] matches a single character that is not contained within the brackets

Examples:
- [^abc] matches any character other than “a”, “b”, or “c”
- [^\]] matches any character which is not ].
Special characters

Honable mentions | Pattern | matches| |—-|—| |\.| .| |\!| !| |\?| ?| |\\| \| |\(| (| |\{| {| |\}| }| |\[| [| |\]| ]| |\n| a new line| |\s| a space or a tab| |\S| not a space nor a tab| |\d| a digit| |\D| a non digit| |\w| a word character (includes for example CJK chars)| |\W| a non word character| |\b | word boundaries|

quantifier
?	zero or one occurrences
*	zero or more occurrences
+	one or more occurrences
{n}	exactly n occurrences
{n,}	`n` or more times
{m, n}	`m` or more times but not more than `n`

Escaping characters

In R, these two characters need to be specially treated in double quotes.

"\\" means a single backslash
"\"" means a double quote

Similarly, for single quotes - '\\' means a single backslash - '\'' means a single quote

For R 4.0 above,

r"(regex)"
r'(regex)'
R"(regex)"
R'(regex)'
r"--(regex)--"
R"--(regex)--"

Package `stringr` (tidyverse)

Manage Strings

fruit <- c("apple", "banana", "pear", "pinapple")
str_length(fruit)

## [1] 5 6 4 8

# add leading white spaces
str_pad(fruit, 10)

## [1] "     apple" "    banana" "      pear" "  pinapple"

# remove white spaces
str_trim(str_pad(fruit, 10))

## [1] "apple"    "banana"   "pear"     "pinapple"

# ...
str_trunc(fruit, 5)

## [1] "apple" "ba..." "pear"  "pi..."

Detect Matches

fruit <- c("apple", "banana", "pear", "pinapple")
# contains a?
str_detect(fruit, "a")

## [1] TRUE TRUE TRUE TRUE

# starts with a
str_detect(fruit, "^a")

## [1]  TRUE FALSE FALSE FALSE

str_starts(fruit, "a")

## [1]  TRUE FALSE FALSE FALSE

# ends with a
str_detect(fruit, "a$")

## [1] FALSE  TRUE FALSE FALSE

str_ends(fruit, "a")

## [1] FALSE  TRUE FALSE FALSE

# contains a, e, i, o or u
str_detect(fruit, "[aeiou]")

## [1] TRUE TRUE TRUE TRUE

# negate the result
str_detect(fruit, "^p", negate = TRUE)

## [1]  TRUE  TRUE FALSE FALSE

fruit <- c("apple", "banana", "pear", "pinapple")
# count the number of matches
str_count(fruit, "p")

## [1] 2 0 1 3

str_count(fruit, "p+")

## [1] 1 0 1 2

# get locations
str_locate(fruit, "[aeiou]+")

##      start end
## [1,]     1   1
## [2,]     2   2
## [3,]     2   3
## [4,]     2   2

str_locate_all(fruit, "[aeiou]+")

## [[1]]
##      start end
## [1,]     1   1
## [2,]     5   5
## 
## [[2]]
##      start end
## [1,]     2   2
## [2,]     4   4
## [3,]     6   6
## 
## [[3]]
##      start end
## [1,]     2   3
## 
## [[4]]
##      start end
## [1,]     2   2
## [2,]     4   4
## [3,]     8   8

# The pattern variable can also be vectorized
str_locate_all(fruit, c("a", "b", "p", "p"))

## [[1]]
##      start end
## [1,]     1   1
## 
## [[2]]
##      start end
## [1,]     1   1
## 
## [[3]]
##      start end
## [1,]     1   1
## 
## [[4]]
##      start end
## [1,]     1   1
## [2,]     5   5
## [3,]     6   6

Subset Strings

fruit <- c("apple", "banana", "pear", "pinapple")
# exact substring from start to end
str_sub(fruit, 1, 3)

## [1] "app" "ban" "pea" "pin"

str_sub(fruit, -3, -2)

## [1] "pl" "an" "ea" "pl"

# only select the elements that match
str_subset(fruit, "p{2,}")

## [1] "apple"    "pinapple"

# indexes that have matches
str_which(fruit, "b|r")

## [1] 2 3

shopping_list <- c("apples x4", "bag of flour", "bag of sugar", "milk x2")

# numbers
str_extract(shopping_list, R"(\d)")

## [1] "4" NA  NA  "2"

# lower case chars
str_extract(shopping_list, "[a-z]+")

## [1] "apples" "bag"    "bag"    "milk"

# lower case chars of length 1 to 4
str_extract(shopping_list, "[a-z]{1,4}")

## [1] "appl" "bag"  "bag"  "milk"

# lower case chars of length 1 to 4 with word boundary
str_extract(shopping_list, R"(\b[a-z]{1,4}\b)")

## [1] NA     "bag"  "bag"  "milk"

str_extract_all(shopping_list, "[a-z]+")

## [[1]]
## [1] "apples" "x"     
## 
## [[2]]
## [1] "bag"   "of"    "flour"
## 
## [[3]]
## [1] "bag"   "of"    "sugar"
## 
## [[4]]
## [1] "milk" "x"

str_extract_all(shopping_list, "[a-z]+", simplify = TRUE)

##      [,1]     [,2] [,3]   
## [1,] "apples" "x"  ""     
## [2,] "bag"    "of" "flour"
## [3,] "bag"    "of" "sugar"
## [4,] "milk"   "x"  ""

strings <- c(
  " 219 733 8965",
  "329-293-8753 ",
  "banana", 
  "239 923 8115 and 842 566 4692",
  "Work: 579-499-7527",
  "$1000",
  "Home: 543.355.3679"
)

phone <- "([2-9][0-9]{2})[- .]([0-9]{3})[- .]([0-9]{4})"

# only the matched pattern
str_extract(strings, phone)

## [1] "219 733 8965" "329-293-8753" NA             "239 923 8115" "579-499-7527"
## [6] NA             "543.355.3679"

str_extract_all(strings, phone)

## [[1]]
## [1] "219 733 8965"
## 
## [[2]]
## [1] "329-293-8753"
## 
## [[3]]
## character(0)
## 
## [[4]]
## [1] "239 923 8115" "842 566 4692"
## 
## [[5]]
## [1] "579-499-7527"
## 
## [[6]]
## character(0)
## 
## [[7]]
## [1] "543.355.3679"

# with subgroups
str_match(strings, phone)

##      [,1]           [,2]  [,3]  [,4]  
## [1,] "219 733 8965" "219" "733" "8965"
## [2,] "329-293-8753" "329" "293" "8753"
## [3,] NA             NA    NA    NA    
## [4,] "239 923 8115" "239" "923" "8115"
## [5,] "579-499-7527" "579" "499" "7527"
## [6,] NA             NA    NA    NA    
## [7,] "543.355.3679" "543" "355" "3679"

str_match_all(strings, phone)

## [[1]]
##      [,1]           [,2]  [,3]  [,4]  
## [1,] "219 733 8965" "219" "733" "8965"
## 
## [[2]]
##      [,1]           [,2]  [,3]  [,4]  
## [1,] "329-293-8753" "329" "293" "8753"
## 
## [[3]]
##      [,1] [,2] [,3] [,4]
## 
## [[4]]
##      [,1]           [,2]  [,3]  [,4]  
## [1,] "239 923 8115" "239" "923" "8115"
## [2,] "842 566 4692" "842" "566" "4692"
## 
## [[5]]
##      [,1]           [,2]  [,3]  [,4]  
## [1,] "579-499-7527" "579" "499" "7527"
## 
## [[6]]
##      [,1] [,2] [,3] [,4]
## 
## [[7]]
##      [,1]           [,2]  [,3]  [,4]  
## [1,] "543.355.3679" "543" "355" "3679"

Mutate Strings

fruit <- c("apple", "banana", "pear", "pinapple")
str_sub(fruit, 1, 5) <- "APPLE"
fruit

## [1] "APPLE"    "APPLEa"   "APPLE"    "APPLEple"

fruits <- c("one apple", "two pears", "three bananas")
# change the first a, e, i, o and u to -
str_replace(fruits, "[aeiou]", "-")

## [1] "-ne apple"     "tw- pears"     "thr-e bananas"

# change all a, e, i, o and u to -
str_replace_all(fruits, "[aeiou]", "-")

## [1] "-n- -ppl-"     "tw- p--rs"     "thr-- b-n-n-s"

# apply a function to the matches
str_replace_all(fruits, "[aeiou]", toupper)

## [1] "OnE ApplE"     "twO pEArs"     "thrEE bAnAnAs"

# remove all a, e, i, o and u
str_replace_all(fruits, "[aeiou]", "")

## [1] "n ppl"    "tw prs"   "thr bnns"

str_remove_all(fruits, "[aeiou]")

## [1] "n ppl"    "tw prs"   "thr bnns"

References of the form \0 (the full match), \1, \2, etc will be replaced with the contents of the respective matched group

fruits <- c("one apple", "two pears", "three bananas three pineapples")
str_match_all(fruits, "(\\w+) (\\w+)")

## [[1]]
##      [,1]        [,2]  [,3]   
## [1,] "one apple" "one" "apple"
## 
## [[2]]
##      [,1]        [,2]  [,3]   
## [1,] "two pears" "two" "pears"
## 
## [[3]]
##      [,1]               [,2]    [,3]        
## [1,] "three bananas"    "three" "bananas"   
## [2,] "three pineapples" "three" "pineapples"

str_replace_all(fruits, "(\\w+) (\\w+)", "\\1 \\1 \\2")

## [1] "one one apple"                             
## [2] "two two pears"                             
## [3] "three three bananas three three pineapples"

strings <- c(
  "Work: 219 733 8965",
  "Mobile: 579-499-7527",
  "Home: 543.355.3679"
)

phone <- "([2-9][0-9]{2})[- .]([0-9]{3})[- .]([0-9]{4})"

str_match_all(strings, phone)

## [[1]]
##      [,1]           [,2]  [,3]  [,4]  
## [1,] "219 733 8965" "219" "733" "8965"
## 
## [[2]]
##      [,1]           [,2]  [,3]  [,4]  
## [1,] "579-499-7527" "579" "499" "7527"
## 
## [[3]]
##      [,1]           [,2]  [,3]  [,4]  
## [1,] "543.355.3679" "543" "355" "3679"

str_replace_all(strings, phone, "(\\1)-\\2-\\3")

## [1] "Work: (219)-733-8965"   "Mobile: (579)-499-7527" "Home: (543)-355-3679"

# apply replacement mutliple times
str_replace_all("foobar", c("foo" = "hello", "bar" = "world"))

## [1] "helloworld"

Changes cases

str_to_lower(c("one Apple", "tWo BANANAs", "THREE orangeS"))

## [1] "one apple"     "two bananas"   "three oranges"

str_to_upper(c("one Apple", "tWo BANANAs", "THREE orangeS"))

## [1] "ONE APPLE"     "TWO BANANAS"   "THREE ORANGES"

str_to_title(c("one Apple", "tWo BANANAs", "THREE orangeS"))

## [1] "One Apple"     "Two Bananas"   "Three Oranges"

Join and split

str_c("apple", "pie")

## [1] "applepie"

str_c(letters, LETTERS)

##  [1] "aA" "bB" "cC" "dD" "eE" "fF" "gG" "hH" "iI" "jJ" "kK" "lL" "mM" "nN" "oO"
## [16] "pP" "qQ" "rR" "sS" "tT" "uU" "vV" "wW" "xX" "yY" "zZ"

str_c(letters, collapse = "")

## [1] "abcdefghijklmnopqrstuvwxyz"

str_c(letters, LETTERS, collapse = "")

## [1] "aAbBcCdDeEfFgGhHiIjJkKlLmMnNoOpPqQrRsStTuUvVwWxXyYzZ"

str_flatten(letters) # faster than str_c(letters, collapse = "") marginally

## [1] "abcdefghijklmnopqrstuvwxyz"

fruits <- c(
  "apples and oranges and pears and bananas",
  "pineapples and mangos and guavas"
)

str_split(fruits, " and ")

## [[1]]
## [1] "apples"  "oranges" "pears"   "bananas"
## 
## [[2]]
## [1] "pineapples" "mangos"     "guavas"

str_split(fruits, " and ", simplify = TRUE)

##      [,1]         [,2]      [,3]     [,4]     
## [1,] "apples"     "oranges" "pears"  "bananas"
## [2,] "pineapples" "mangos"  "guavas" ""

str_split(fruits, " and ", n = 2)

## [[1]]
## [1] "apples"                        "oranges and pears and bananas"
## 
## [[2]]
## [1] "pineapples"        "mangos and guavas"

str_split(fruits, " and ", n = 3, simplify = TRUE)

##      [,1]         [,2]      [,3]               
## [1,] "apples"     "oranges" "pears and bananas"
## [2,] "pineapples" "mangos"  "guavas"

# a shorthand for str_split(..., n, simplify = TRUE)
str_split_fixed(fruits, " and ", n = 3)

##      [,1]         [,2]      [,3]               
## [1,] "apples"     "oranges" "pears and bananas"
## [2,] "pineapples" "mangos"  "guavas"

Glue String

name <- c("John", "Peter")
age <- c(23, 17)
# get variables from globals
str_glue("{name} is {age}")

## John is 23
## Peter is 17

# get variables from arguments
str_glue("{name} is {age}", name = "Anna", age = 43)

## Anna is 43

mtcars %>%
  group_by(cyl) %>%
  summarize(m = mean(mpg)) %>%
  str_glue_data("A {cyl}-cylinder car has an average mpg of {round(m, 2)}.")

## A 4-cylinder car has an average mpg of 26.66.
## A 6-cylinder car has an average mpg of 19.74.
## A 8-cylinder car has an average mpg of 15.1.

Order Strings

names <- c("John", "Albert", "Peter", "Charles")
str_order(names)

## [1] 2 4 1 3

str_sort(names)

## [1] "Albert"  "Charles" "John"    "Peter"

str_sort(names, decreasing = TRUE)

## [1] "Peter"   "John"    "Charles" "Albert"

files <- c("file10", "file2", "file5", "file1")
str_sort(files)

## [1] "file1"  "file10" "file2"  "file5"

# more natrual order
str_sort(files, numeric = TRUE)

## [1] "file1"  "file2"  "file5"  "file10"

str_order(files, numeric = TRUE)

## [1] 4 2 3 1

Pattern interpretation

Patterns in stringr functions are interpreted as regex in default, you could use fixed or regex to change the default behavior.

strings <- c("abb", "a.b")

str_detect(strings, "a.b")

## [1] TRUE TRUE

str_detect(strings, fixed("a.b"))

## [1] FALSE  TRUE

str_detect(strings, fixed("A.B", ignore_case = TRUE))

## [1] FALSE  TRUE

str_match_all("abaa\na", "^a")

## [[1]]
##      [,1]
## [1,] "a"

str_match_all("abaa\na", regex("^a", multiline = TRUE))

## [[1]]
##      [,1]
## [1,] "a" 
## [2,] "a"

str_match_all("abaa\na", regex("^A", ignore_case = TRUE, multiline = TRUE))

## [[1]]
##      [,1]
## [1,] "a" 
## [2,] "a"

More advanced topics of regex

I assume that you are now comfortable with the basic regex. Let’s talk about some more advanced topics.

non-capturing groups

a capturing group could be created by using a pair of parentheses (<regex>) and a non-captureing group can be created by using (?:<regex>).

# with captureing groups
str_match("12mb", "([0-9]+)([a-z]+)")

##      [,1]   [,2] [,3]
## [1,] "12mb" "12" "mb"

# with non-capturing groups
str_match(c("12mb", "12MB"), "(?:[0-9]+)(?:[a-z]+|[A-Z]+)")

##      [,1]  
## [1,] "12mb"
## [2,] "12MB"

atomic groups

an atomic group prevents the regex engine from backtracking back into the group

strings <- c("abc", "abcc")
str_extract(strings, "a(?>bc|b)c")

## [1] NA     "abcc"

lazy quantifier

quantifier
??	zero or one occurrence, lazy
*?	zero or more occurrences, lazy
+?	one or more occurrences, lazy
{n,}?	`n` or more times, lazy
{m, n}?	`m` or more times but not more than `n`, lazy

strings <- c("abc", "acb")
# it is greedy
str_extract(strings, "ab?c?")

## [1] "abc" "ac"

# it is lazy
str_extract(strings, "ab??c?")

## [1] "a"  "ac"

strings <- "acbcbbc"
# it is greedy, longest match
str_extract(strings, "a.*c")

## [1] "acbcbbc"

# it is lazy, shortest match
str_extract(strings, "a.*?c")

## [1] "ac"

strings <- "acbcbbc"
# it is greedy, longest match
str_extract(strings, "a.+c")

## [1] "acbcbbc"

# it is lazy, shortest match
str_extract(strings, "a.+?c")

## [1] "acbc"

strings <- "acbcbbc"
str_extract(strings, "a.{2,}c")

## [1] "acbcbbc"

str_extract(strings, "a.{2,}?c")

## [1] "acbc"

Possessive quantifier (does not backtrack)

quantifier

?+ zero or one occurrence, posessive

*+ zero or more occurrences, posessive

++ one or more occurrences, posessive

quantifier
?+	zero or one occurrence, posessive
*+	zero or more occurrences, posessive
++	one or more occurrences, posessive

strings <- c("apple", "pineapple", "pinepineapple")
str_extract(strings, "(pine)?pineapple")

## [1] NA              "pineapple"     "pinepineapple"

str_extract(strings, "(pine)?+pineapple")

## [1] NA              NA              "pinepineapple"

str_extract(strings, "(pine)*pineapple")

## [1] NA              "pineapple"     "pinepineapple"

str_extract(strings, "(pine)*+pineapple")

## [1] NA NA NA

back reference

back reference is used to reference a previous matched group \1, \2, \3 etc..

strings <- c(":abc:", "?foo?", "*bar*")
str_match(strings, "(.)[a-z]+\\1")

##      [,1]    [,2]
## [1,] ":abc:" ":" 
## [2,] "?foo?" "?" 
## [3,] "*bar*" "*"

look ahead
- positive look ahead

# it matches only the second `t` because the second `t` is followed by `s`
str_locate_all("streets", "t(?=s)")

## [[1]]
##      start end
## [1,]     6   6

negative look ahead

# it matches only the first `t` because the second `t` is followed by `s`
str_locate_all("streets", "t(?!s)")

## [[1]]
##      start end
## [1,]     2   2

look behind
- positive look behind

# it matches only the first `t` because the second `t` follows `s`
str_locate_all("streets", "(?<=s)t")

## [[1]]
##      start end
## [1,]     2   2

negative look behind

# it matches only the second `t` because the first `t` follows `s`
str_locate_all("streets", "(?<!s)t")

## [[1]]
##      start end
## [1,]     6   6

# it also works if the look behind pattern is a (bounded) regex
# (base R functions do not support regex in look behind pattern)
str_locate_all("twisty streets", "(?<![se]{1,100})t")

## [[1]]
##      start end
## [1,]     1   1

recursion

We try to find the highest level of paranthesse in ((((x)))), ((x)((y))(z)) (x) (y)

# what doesn't work
strings <- c("((((x))))", "((x) ((y)) (z))", "(x) (y)")
str_extract_all(strings, "\\([^(]*\\)")

## [[1]]
## [1] "(x))))"
## 
## [[2]]
## [1] "(x)"  "(y))" "(z))"
## 
## [[3]]
## [1] "(x)" "(y)"

str_extract_all(strings, "\\([^(]*?\\)")

## [[1]]
## [1] "(x)"
## 
## [[2]]
## [1] "(x)" "(y)" "(z)"
## 
## [[3]]
## [1] "(x)" "(y)"

# how about back reference the entire group? no, it doesnt work
str_extract_all(strings, "(\\((?:\\1*|[^)]*?)+\\))")

## [[1]]
## [1] "((((x)"
## 
## [[2]]
## [1] "((x)" "((y)" "(z)" 
## 
## [[3]]
## [1] "(x)" "(y)"

stringr’s functions don’t support recusions, we will need to use base R functions (which are difficult to use and slower). Luckily, there is rematch2 which provides nice wrapping functions to base R functions (still, it is slower).

library(rematch2)
re_match_all(strings, "\\((?:(?0)*|[^)]*?)+\\)") %>%
  pull(.match)

## [[1]]
## [1] "((((x))))"
## 
## [[2]]
## [1] "((x) ((y)) (z))"
## 
## [[3]]
## [1] "(x)" "(y)"

Three games to learn regex

Reference

Online regex tester https://regex101.com/
R for Data Science https://r4ds.had.co.nz/strings.html

Regex