library(tidyverse)
library(DBI)

What is a database? It is what google says

a structured set of data held in a computer, especially one that is accessible in various ways.

A relational database is a type of database that stores and provides access to data points that are related to one another. Relation databases are administrated by a Relational Database Management System (RDBMS). The data in RDBMS is stored in database objects called tables. A table is a collection of related data entries and it consists of columns and rows.

There are many RDBMS - MySQL (owned by Oracle) - MariaDB (open source version MySQL) - PostgreSQL (open source, ANSI) - SQL Server (microsoft) - SQLite (open source, single file)

What is SQL? Structured Query Language (or SQL) is a standard language for accessing and manipulating relational databases. However, each RDMBS may have their own extension of the SQL language and their implmentation may vary too.

Connect to a database

We are going to use a popular database called Sakila https://www.jooq.org/sakila.

The Sakila database is a nicely normalised schema modelling a DVD rental store, featuring things like films, actors, film-actor relationships, and a central inventory table that connects films, stores, and rentals.

In the following, we are going to use sqlite, postgresql and MySql.

SQLite

The database is called sakila.sqlite. You could either git clone from lectures repo or download using the code

if (!file.exists("sakila.sqlite") || file.size("sakila.sqlite") == 0) {
  download.file(
    "https://github.com/ucdavis-sta141b-2021-winter/sta141b-lectures/raw/master/ch02-sql/sakila.sqlite", 
    destfile = "sakila.sqlite")
}

The file format is .sqlite which is one of the very common relational database formats, especially for simple problems.

sakila_lite <- dbConnect(RSQLite::SQLite(), dbname = "sakila.sqlite")
sakila_lite %>% dbListTables()
##  [1] "actor"                  "address"                "category"              
##  [4] "city"                   "country"                "customer"              
##  [7] "customer_list"          "film"                   "film_actor"            
## [10] "film_category"          "film_list"              "film_text"             
## [13] "inventory"              "language"               "payment"               
## [16] "rental"                 "sales_by_film_category" "sales_by_store"        
## [19] "sqlite_sequence"        "staff"                  "staff_list"            
## [22] "store"
sakila_lite %>% dbListFields("actor")  # column names
## [1] "actor_id"    "first_name"  "last_name"   "last_update"

Postgresql

I have also uploaded the Sakila database to a postgres server owned by the department. (You’ll need to either on the campus or over UCD vpn to connect to it)

sakila_psql <- dbConnect(RPostgres::Postgres(),
  dbname = "sakila",
  user = "psqluser", password = "secret", host = "alan.ucdavis.edu"
)
sakila_psql %>% dbListTables()
##  [1] "inventory"                  "staff"                     
##  [3] "payment_p2017_05"           "payment_p2017_06"          
##  [5] "payment"                    "actor"                     
##  [7] "store"                      "payment_p2017_02"          
##  [9] "rental"                     "language"                  
## [11] "payment_p2017_03"           "address"                   
## [13] "payment_p2017_01"           "film"                      
## [15] "country"                    "city"                      
## [17] "payment_p2017_04"           "film_actor"                
## [19] "category"                   "film_category"             
## [21] "customer"                   "actor_info"                
## [23] "customer_list"              "film_list"                 
## [25] "nicer_but_slower_film_list" "sales_by_film_category"    
## [27] "sales_by_store"             "staff_list"

MySQL

MySQL (MariaDB) is another popular RDBMS.

sakila_mysql <- dbConnect(RMariaDB::MariaDB(),
  dbname = "DATA",
  user = "student", password = "141b@2020stats", host = "alan.ucdavis.edu"
)
sakila_mysql %>% dbListTables()
##  [1] "actor"                      "address"                   
##  [3] "category"                   "city"                      
##  [5] "country"                    "customer"                  
##  [7] "customer_list"              "film"                      
##  [9] "film_actor"                 "film_category"             
## [11] "film_list"                  "film_text"                 
## [13] "inventory"                  "language"                  
## [15] "nicer_but_slower_film_list" "payment"                   
## [17] "rental"                     "sales_by_film_category"    
## [19] "sales_by_store"             "staff"                     
## [21] "staff_list"                 "store"

How not to use SQL?

dplyr provides an excellent interface for users without any SQL background to query databases.

film <- sakila_lite %>%
  tbl("film")
# to download all observations
film2 <- film %>% 
  collect()

film relies on the connection

# dbDisconnect(sakila_lite)

sakila_lite %>% tbl("film") creates a virtual table rather loading the whole table into memory.

class(film)
## [1] "tbl_SQLiteConnection" "tbl_dbi"              "tbl_sql"             
## [4] "tbl_lazy"             "tbl"
class(film2)
## [1] "tbl_df"     "tbl"        "data.frame"
sakila_lite %>% dbListFields("film")
##  [1] "film_id"              "title"                "description"         
##  [4] "release_year"         "language_id"          "original_language_id"
##  [7] "rental_duration"      "rental_rate"          "length"              
## [10] "replacement_cost"     "rating"               "special_features"    
## [13] "last_update"
film90 <- film %>%
  filter(length > 90) %>% 
  collect()
class(film90)
## [1] "tbl_df"     "tbl"        "data.frame"
film90
## # A tibble: 675 x 13
##    film_id title description release_year language_id original_langua…
##      <int> <chr> <chr>       <chr>              <int>            <int>
##  1       4 AFFA… A Fanciful… 2006                   1               NA
##  2       5 AFRI… A Fast-Pac… 2006                   1               NA
##  3       6 AGEN… A Intrepid… 2006                   1               NA
##  4       9 ALAB… A Thoughtf… 2006                   1               NA
##  5      11 ALAM… A Boring E… 2006                   1               NA
##  6      12 ALAS… A Fanciful… 2006                   1               NA
##  7      13 ALI … A Action-P… 2006                   1               NA
##  8      14 ALIC… A Emotiona… 2006                   1               NA
##  9      16 ALLE… A Fast-Pac… 2006                   1               NA
## 10      19 AMAD… A Emotiona… 2006                   1               NA
## # … with 665 more rows, and 7 more variables: rental_duration <int>,
## #   rental_rate <dbl>, length <int>, replacement_cost <dbl>, rating <chr>,
## #   special_features <chr>, last_update <chr>

Sakila queries

https://datamastery.gitlab.io/exercises/sakila-queries.html

scarlett <- sakila_lite %>%
  tbl("actor") %>%
  filter(str_to_lower(first_name) == str_to_lower("Scarlett")) %>%
  collect()

Suppose we want to make the result a bit more beautiful.

scarlett %>% 
  mutate(
    first_name = str_to_title(first_name), 
    last_name = str_to_title(last_name))
## # A tibble: 2 x 4
##   actor_id first_name last_name last_update        
##      <int> <chr>      <chr>     <chr>              
## 1       81 Scarlett   Damon     2019-04-11 18:11:48
## 2      124 Scarlett   Bening    2019-04-11 18:11:48

Note: mutation is done locally in R.

It is possible to do mutation on the remote server (Postgresql sevrer)

sakila_psql %>%
  tbl("actor") %>%
  filter(str_to_lower(first_name) == str_to_lower("Scarlett")) %>%
  mutate(
    first_name = str_to_title(first_name), 
    last_name = str_to_title(last_name)) %>% 
  collect()
## # A tibble: 2 x 4
##   actor_id first_name last_name last_update        
##      <int> <chr>      <chr>     <dttm>             
## 1       81 Scarlett   Damon     2017-02-15 09:34:33
## 2      124 Scarlett   Bening    2017-02-15 09:34:33

Note: SQLite and Mysql don’t support transforming title case but Postgresql does.

sakila_lite %>%
  tbl("actor") %>%
  filter(str_to_lower(last_name) == "johansson") %>%
  collect()
## # A tibble: 3 x 4
##   actor_id first_name last_name last_update        
##      <int> <chr>      <chr>     <chr>              
## 1        8 MATTHEW    JOHANSSON 2019-04-11 18:11:48
## 2       64 RAY        JOHANSSON 2019-04-11 18:11:48
## 3      146 ALBERT     JOHANSSON 2019-04-11 18:11:48
sakila_lite %>%
  tbl("actor") %>%
  summarize(n = n_distinct(last_name)) %>% 
  collect()
## # A tibble: 1 x 1
##       n
##   <int>
## 1   121
sakila_lite %>%
  tbl("actor") %>%
  count(last_name) %>%
  filter(n == 1) %>%
  collect()
## # A tibble: 66 x 2
##    last_name     n
##    <chr>     <int>
##  1 ASTAIRE       1
##  2 BACALL        1
##  3 BALE          1
##  4 BALL          1
##  5 BARRYMORE     1
##  6 BASINGER      1
##  7 BERGEN        1
##  8 BERGMAN       1
##  9 BIRCH         1
## 10 BLOOM         1
## # … with 56 more rows
sakila_lite %>%
  tbl("actor") %>%
  count(last_name) %>%
  filter(n > 1) %>%
  collect()
## # A tibble: 55 x 2
##    last_name     n
##    <chr>     <int>
##  1 AKROYD        3
##  2 ALLEN         3
##  3 BAILEY        2
##  4 BENING        2
##  5 BERRY         3
##  6 BOLGER        2
##  7 BRODY         2
##  8 CAGE          2
##  9 CHASE         2
## 10 CRAWFORD      2
## # … with 45 more rows
actor <- sakila_lite %>% tbl("actor")
sakila_lite %>%
  tbl("film_actor") %>%
  count(actor_id) %>%
  arrange(desc(n)) %>%
  head(1) %>%
  inner_join(actor, by = "actor_id") %>%
  collect()
## # A tibble: 1 x 5
##   actor_id     n first_name last_name last_update        
##      <int> <int> <chr>      <chr>     <chr>              
## 1      107    42 GINA       DEGENERES 2019-04-11 18:11:48
sakila_lite %>%
  tbl("film") %>%
  summarize(m = mean(length)) %>%
  collect()
## Warning: Missing values are always removed in SQL.
## Use `mean(x, na.rm = TRUE)` to silence this warning
## This warning is displayed only once per session.
## # A tibble: 1 x 1
##       m
##   <dbl>
## 1  115.
sakila_lite %>%
  tbl("film") %>%
  left_join(tbl(sakila_lite, "film_category"), by = "film_id") %>%
  group_by(category_id) %>%
  summarize(mean_length = mean(length)) %>%
  left_join(tbl(sakila_lite, "category"), by = "category_id") %>%
  select(name, mean_length) %>%
  collect()
## # A tibble: 16 x 2
##    name        mean_length
##    <chr>             <dbl>
##  1 Action             112.
##  2 Animation          111.
##  3 Children           110.
##  4 Classics           112.
##  5 Comedy             116.
##  6 Documentary        109.
##  7 Drama              121.
##  8 Family             115.
##  9 Foreign            122.
## 10 Games              128.
## 11 Horror             112.
## 12 Music              114.
## 13 New                111.
## 14 Sci-Fi             108.
## 15 Sports             128.
## 16 Travel             113.
uz <- sakila_lite %>%
  tbl("film") %>%
  filter(str_to_lower(title) == str_to_lower("Unforgiven Zoolander")) %>%
  select(film_id)
all_inventories_of_store1 <- sakila_lite %>%
  tbl("inventory") %>%
  filter(store_id == 1) %>%
  select(film_id, inventory_id)
not_yet_returned <- sakila_lite %>%
  tbl("rental") %>%
  filter(is.na(return_date)) %>%
  select(inventory_id)
uz %>%
  inner_join(all_inventories_of_store1, by = "film_id") %>%
  anti_join(not_yet_returned, by = "inventory_id") %>%
  count() %>%
  collect()
## # A tibble: 1 x 1
##       n
##   <int>
## 1     2

SQL

We just see some example queries of a relational database. Behind the scene, we are using a language called SQL. For example, in the last query, the SQL used is

uz %>%
  inner_join(all_inventories_of_store1) %>%
  anti_join(not_yet_returned) %>%
  count() %>%
  show_query()
## Joining, by = "film_id"
## Joining, by = "inventory_id"
## <SQL>
## SELECT COUNT() AS `n`
## FROM (SELECT * FROM (SELECT `LHS`.`film_id` AS `film_id`, `RHS`.`inventory_id` AS `inventory_id`
## FROM (SELECT `film_id`
## FROM `film`
## WHERE (LOWER(`title`) = LOWER('Unforgiven Zoolander'))) AS `LHS`
## INNER JOIN (SELECT `film_id`, `inventory_id`
## FROM `inventory`
## WHERE (`store_id` = 1.0)) AS `RHS`
## ON (`LHS`.`film_id` = `RHS`.`film_id`)
## ) AS `LHS`
## WHERE NOT EXISTS (
##   SELECT 1 FROM (SELECT `inventory_id`
## FROM `rental`
## WHERE (((`return_date`) IS NULL))) AS `RHS`
##   WHERE (`LHS`.`inventory_id` = `RHS`.`inventory_id`)
## ))

Why learning SQL when there is dplyr?

In R, a SQL query can be made by using dbGetQuery which counts number of rows in rental.

sakila_lite %>%
  dbGetQuery("SELECT COUNT(*) AS `n` FROM `rental`")
##       n
## 1 16044

We could also make SQL query by sql block. In here, we are using the connection sakila_lite. The result will be printed directly.

SELECT COUNT(*) AS `n` FROM `rental`
1 records
n
16044

In we need the output, set output.var to rental_count

SELECT COUNT(*) AS `n` FROM `rental`;

The output could be later used in R blocks

rental_count
##       n
## 1 16044

For comparison, in Python, we use

import sqlite3
sakila = sqlite3.connect('sakila.sqlite')
c = sakila.cursor()
c.execute("SELECT COUNT(*) AS `n` FROM `rental`;")
c.fetchall()

Identifier quotation

SQLite supports both double quotes (which is the standard) and backticks to quote identifiers (table and column names). Backticks are used in another popular database MySQL. Double quotes are used in Postgresql. It is always a good practice to quote the identifiers.

-- MySQL
SELECT COUNT(*) AS `n` from `actor`;
1 records
n
200
-- PostgresQL (ANSI standard)
SELECT COUNT(*) AS "n" from "actor";
1 records
n
200

SQLite supports both styles.

SELECT COUNT(*) AS `n` from `actor`;
1 records
n
200

SELECT

The SELECT statement is pretty much the select() function in dplyr.

SELECT * FROM "rental";
Displaying records 1 - 10
rental_id rental_date inventory_id customer_id return_date staff_id last_update
1 2005-05-24 22:53:30.000 367 130 2005-05-26 22:04:30.000 1 2019-04-11 18:11:49
2 2005-05-24 22:54:33.000 1525 459 2005-05-28 19:40:33.000 1 2019-04-11 18:11:49
3 2005-05-24 23:03:39.000 1711 408 2005-06-01 22:12:39.000 1 2019-04-11 18:11:49
4 2005-05-24 23:04:41.000 2452 333 2005-06-03 01:43:41.000 2 2019-04-11 18:11:49
5 2005-05-24 23:05:21.000 2079 222 2005-06-02 04:33:21.000 1 2019-04-11 18:11:49
6 2005-05-24 23:08:07.000 2792 549 2005-05-27 01:32:07.000 1 2019-04-11 18:11:49
7 2005-05-24 23:11:53.000 3995 269 2005-05-29 20:34:53.000 2 2019-04-11 18:11:49
8 2005-05-24 23:31:46.000 2346 239 2005-05-27 23:33:46.000 2 2019-04-11 18:11:49
9 2005-05-25 00:00:40.000 2580 126 2005-05-28 00:22:40.000 1 2019-04-11 18:11:49
10 2005-05-25 00:02:21.000 1824 399 2005-05-31 22:44:21.000 2 2019-04-11 18:11:49
SELECT LOWER("last_name") AS "family_name" FROM "actor";
Displaying records 1 - 10
family_name
akroyd
akroyd
akroyd
allen
allen
allen
astaire
bacall
bailey
bailey

For comparison,

sakila_lite %>%
  tbl("actor") %>%
  transmute(family_name = str_to_lower(last_name)) %>%
  collect()
## # A tibble: 200 x 1
##    family_name
##    <chr>      
##  1 akroyd     
##  2 akroyd     
##  3 akroyd     
##  4 allen      
##  5 allen      
##  6 allen      
##  7 astaire    
##  8 bacall     
##  9 bailey     
## 10 bailey     
## # … with 190 more rows
sakila_lite %>%
  tbl("actor") %>%
  transmute(family_name = str_to_lower(last_name)) %>%
  show_query()
## <SQL>
## SELECT LOWER(`last_name`) AS `family_name`
## FROM `actor`

To select every column or some columns,

SELECT * FROM "actor";
Displaying records 1 - 10
actor_id first_name last_name last_update
1 PENELOPE GUINESS 2019-04-11 18:11:48
2 NICK WAHLBERG 2019-04-11 18:11:48
3 ED CHASE 2019-04-11 18:11:48
4 JENNIFER DAVIS 2019-04-11 18:11:48
5 JOHNNY LOLLOBRIGIDA 2019-04-11 18:11:48
6 BETTE NICHOLSON 2019-04-11 18:11:48
7 GRACE MOSTEL 2019-04-11 18:11:48
8 MATTHEW JOHANSSON 2019-04-11 18:11:48
9 JOE SWANK 2019-04-11 18:11:48
10 CHRISTIAN GABLE 2019-04-11 18:11:48
SELECT "rental_id", "last_update" AS "last_update_time" FROM "rental";
Displaying records 1 - 10
rental_id last_update_time
1 2019-04-11 18:11:49
2 2019-04-11 18:11:49
3 2019-04-11 18:11:49
4 2019-04-11 18:11:49
5 2019-04-11 18:11:49
6 2019-04-11 18:11:49
7 2019-04-11 18:11:49
8 2019-04-11 18:11:49
9 2019-04-11 18:11:49
10 2019-04-11 18:11:49

ORDER BY Clause

It is equivalent to arrange() in dplyr

SELECT "film_id", "title" FROM "film" ORDER BY "title";
Displaying records 1 - 10
film_id title
1 ACADEMY DINOSAUR
2 ACE GOLDFINGER
3 ADAPTATION HOLES
4 AFFAIR PREJUDICE
5 AFRICAN EGG
6 AGENT TRUMAN
7 AIRPLANE SIERRA
8 AIRPORT POLLOCK
9 ALABAMA DEVIL
10 ALADDIN CALENDAR
SELECT "film_id", "title" FROM "film" ORDER BY "title" DESC;
Displaying records 1 - 10
film_id title
1000 ZORRO ARK
999 ZOOLANDER FICTION
998 ZHIVAGO CORE
997 YOUTH KICK
996 YOUNG LANGUAGE
995 YENTL IDAHO
994 WYOMING STORM
993 WRONG BEHAVIOR
992 WRATH MILE
991 WORST BANGER

DISTINCT

DISTINCT operator to remove duplicates from a result set. It is equivalent to distinct() function in dplyr.

SELECT DISTINCT "last_name" FROM "actor";
Displaying records 1 - 10
last_name
AKROYD
ALLEN
ASTAIRE
BACALL
BAILEY
BALE
BALL
BARRYMORE
BASINGER
BENING
sakila_lite %>%
  tbl("actor") %>%
  distinct(last_name) %>% 
  show_query()
## <SQL>
## SELECT DISTINCT `last_name`
## FROM `actor`

LIMIT

SELECT * FROM "actor" LIMIT 2;
2 records
actor_id first_name last_name last_update
1 PENELOPE GUINESS 2019-04-11 18:11:48
2 NICK WAHLBERG 2019-04-11 18:11:48
sakila_lite %>%
  tbl("actor") %>%
  head(2) %>% 
  show_query()
## <SQL>
## SELECT *
## FROM `actor`
## LIMIT 2

WHERE

It is equivalent to filter() in dplyr.

SELECT * FROM "film" WHERE `rating` = "PG" AND "length" = 90;
1 records
film_id title description release_year language_id original_language_id rental_duration rental_rate length replacement_cost rating special_features last_update
776 SECRET GROUNDHOG A Astounding Story of a Cat And a Database Administrator who must Build a Technical Writer in New Orleans 2006 1 NA 6 4.99 90 11.99 PG Commentaries,Deleted Scenes 2019-04-11 18:11:48
SELECT * FROM "film" WHERE "rating" = 'PG' AND "length" = 90;
1 records
film_id title description release_year language_id original_language_id rental_duration rental_rate length replacement_cost rating last_update special_features fulltext
776 SECRET GROUNDHOG A Astounding Story of a Cat And a Database Administrator who must Build a Technical Writer in New Orleans 2006 1 NA 6 4.99 90 11.99 PG 2017-09-10 17:46:03 {Commentaries,“Deleted Scenes”} ‘administr’:12 ‘astound’:4 ‘build’:15 ‘cat’:8 ‘databas’:11 ‘groundhog’:2 ‘must’:14 ‘new’:20 ‘orlean’:21 ‘secret’:1 ‘stori’:5 ‘technic’:17 ‘writer’:18
SELECT * FROM `film` WHERE `rating` = "PG" OR `length` = 90;
Displaying records 1 - 10
film_id title description release_year language_id original_language_id rental_duration rental_rate length replacement_cost rating special_features last_update
1 ACADEMY DINOSAUR An Epic Drama of a Feminist And a Mad Scientist who must Battle a Teacher in The Canadian Rockies 2006 1 NA 6 0.99 86 20.99 PG Deleted Scenes,Behind the Scenes 2006-02-15 13:03:42
6 AGENT TRUMAN A Intrepid Panorama of a Robot And a Boy who must Escape a Sumo Wrestler in Ancient China 2006 1 NA 3 2.99 169 17.99 PG Deleted Scenes 2006-02-15 13:03:42
12 ALASKA PHANTOM A Fanciful Saga of a Hunter And a Pastry Chef who must Vanquish a Boy in Australia 2006 1 NA 6 0.99 136 22.99 PG Commentaries,Deleted Scenes 2006-02-15 13:03:42
13 ALI FOREVER A Action-Packed Drama of a Dentist And a Crocodile who must Battle a Feminist in The Canadian Rockies 2006 1 NA 4 4.99 150 21.99 PG Deleted Scenes,Behind the Scenes 2006-02-15 13:03:42
19 AMADEUS HOLY A Emotional Display of a Pioneer And a Technical Writer who must Battle a Man in A Baloon 2006 1 NA 6 0.99 113 20.99 PG Commentaries,Deleted Scenes,Behind the Scenes 2006-02-15 13:03:42
37 ARIZONA BANG A Brilliant Panorama of a Mad Scientist And a Mad Cow who must Meet a Pioneer in A Monastery 2006 1 NA 3 2.99 121 28.99 PG Trailers,Deleted Scenes 2006-02-15 13:03:42
41 ARSENIC INDEPENDENCE A Fanciful Documentary of a Mad Cow And a Womanizer who must Find a Dentist in Berlin 2006 1 NA 4 0.99 137 17.99 PG Trailers,Deleted Scenes,Behind the Scenes 2006-02-15 13:03:42
57 BASIC EASY A Stunning Epistle of a Man And a Husband who must Reach a Mad Scientist in A Jet Boat 2006 1 NA 4 2.99 90 18.99 PG-13 Deleted Scenes 2006-02-15 13:03:42
63 BEDAZZLED MARRIED A Astounding Character Study of a Madman And a Robot who must Meet a Mad Scientist in An Abandoned Fun House 2006 1 NA 6 0.99 73 21.99 PG Trailers,Deleted Scenes,Behind the Scenes 2006-02-15 13:03:42
65 BEHAVIOR RUNAWAY A Unbelieveable Drama of a Student And a Husband who must Outrace a Sumo Wrestler in Berlin 2006 1 NA 3 4.99 100 20.99 PG Trailers,Deleted Scenes,Behind the Scenes 2006-02-15 13:03:42

Remarks:

  • For strings, SQLite (and MySQL) allows double qoutes to quote string values but it is actually not the SQL standard. In SQL standard, strings are quoted in single quotes.
  • In SQL standard, we should use = for comparison, but not ==.
  • There exists other keywords such as NOT and OR.

dplyr equivalent

sakila_lite %>%
  tbl("film") %>%
  filter(rating == "PG", length == 90)
## # Source:   lazy query [?? x 13]
## # Database: sqlite 3.30.1
## #   [/Users/Randy/Dropbox/Winter2021/sta141b/sta141b-lectures/ch02-sql/sakila.sqlite]
##   film_id title description release_year language_id original_langua…
##     <int> <chr> <chr>       <chr>              <int>            <int>
## 1     776 SECR… A Astoundi… 2006                   1               NA
## # … with 7 more variables: rental_duration <int>, rental_rate <dbl>,
## #   length <int>, replacement_cost <dbl>, rating <chr>, special_features <chr>,
## #   last_update <chr>
  • The IN operator
SELECT * FROM "film" WHERE "rating" IN ('PG', 'PG-13');
Displaying records 1 - 10
film_id title description release_year language_id original_language_id rental_duration rental_rate length replacement_cost rating special_features last_update
1 ACADEMY DINOSAUR A Epic Drama of a Feminist And a Mad Scientist who must Battle a Teacher in The Canadian Rockies 2006 1 NA 6 0.99 86 20.99 PG Deleted Scenes,Behind the Scenes 2019-04-11 18:11:48
6 AGENT TRUMAN A Intrepid Panorama of a Robot And a Boy who must Escape a Sumo Wrestler in Ancient China 2006 1 NA 3 2.99 169 17.99 PG Deleted Scenes 2019-04-11 18:11:48
7 AIRPLANE SIERRA A Touching Saga of a Hunter And a Butler who must Discover a Butler in A Jet Boat 2006 1 NA 6 4.99 62 28.99 PG-13 Trailers,Deleted Scenes 2019-04-11 18:11:48
9 ALABAMA DEVIL A Thoughtful Panorama of a Database Administrator And a Mad Scientist who must Outgun a Mad Scientist in A Jet Boat 2006 1 NA 3 2.99 114 21.99 PG-13 Trailers,Deleted Scenes 2019-04-11 18:11:48
12 ALASKA PHANTOM A Fanciful Saga of a Hunter And a Pastry Chef who must Vanquish a Boy in Australia 2006 1 NA 6 0.99 136 22.99 PG Commentaries,Deleted Scenes 2019-04-11 18:11:48
13 ALI FOREVER A Action-Packed Drama of a Dentist And a Crocodile who must Battle a Feminist in The Canadian Rockies 2006 1 NA 4 4.99 150 21.99 PG Deleted Scenes,Behind the Scenes 2019-04-11 18:11:48
18 ALTER VICTORY A Thoughtful Drama of a Composer And a Feminist who must Meet a Secret Agent in The Canadian Rockies 2006 1 NA 6 0.99 57 27.99 PG-13 Trailers,Behind the Scenes 2019-04-11 18:11:48
19 AMADEUS HOLY A Emotional Display of a Pioneer And a Technical Writer who must Battle a Man in A Baloon 2006 1 NA 6 0.99 113 20.99 PG Commentaries,Deleted Scenes,Behind the Scenes 2019-04-11 18:11:48
28 ANTHEM LUKE A Touching Panorama of a Waitress And a Woman who must Outrace a Dog in An Abandoned Amusement Park 2006 1 NA 5 4.99 91 16.99 PG-13 Deleted Scenes,Behind the Scenes 2019-04-11 18:11:48
33 APOLLO TEEN A Action-Packed Reflection of a Crocodile And a Explorer who must Find a Sumo Wrestler in An Abandoned Mine Shaft 2006 1 NA 5 2.99 153 15.99 PG-13 Trailers,Commentaries,Deleted Scenes,Behind the Scenes 2019-04-11 18:11:48
sakila_lite %>%
  tbl("film") %>%
  filter(rating %in% c("PG", "PG-13")) %>% 
  collect()
## # A tibble: 417 x 13
##    film_id title description release_year language_id original_langua…
##      <int> <chr> <chr>       <chr>              <int>            <int>
##  1       1 ACAD… A Epic Dra… 2006                   1               NA
##  2       6 AGEN… A Intrepid… 2006                   1               NA
##  3       7 AIRP… A Touching… 2006                   1               NA
##  4       9 ALAB… A Thoughtf… 2006                   1               NA
##  5      12 ALAS… A Fanciful… 2006                   1               NA
##  6      13 ALI … A Action-P… 2006                   1               NA
##  7      18 ALTE… A Thoughtf… 2006                   1               NA
##  8      19 AMAD… A Emotiona… 2006                   1               NA
##  9      28 ANTH… A Touching… 2006                   1               NA
## 10      33 APOL… A Action-P… 2006                   1               NA
## # … with 407 more rows, and 7 more variables: rental_duration <int>,
## #   rental_rate <dbl>, length <int>, replacement_cost <dbl>, rating <chr>,
## #   special_features <chr>, last_update <chr>
  • The LIKE operator

See https://www.w3schools.com/sql/sql_like.asp

SELECT "title" FROM "film" WHERE "title" LIKE '%victory%';
2 records
title
ALTER VICTORY
VICTORY ACADEMY

Remark: in SQLite, the LIKE operator is case insensitive. However, it is not the case for other DBs.

SELECT "title" FROM "film" WHERE "title" LIKE '%victory%';
0 records
title

In Postgres, there is a ILIKE (case insensitive LIKE) operator

SELECT "title" FROM "film" WHERE "title" ILIKE '%victory%';
2 records
title
ALTER VICTORY
VICTORY ACADEMY

In MySQL (depends on the configuration),

SELECT `title` FROM `film` WHERE `title` LIKE '%victory%';
2 records
title
ALTER VICTORY
VICTORY ACADEMY

dplyr equivalent

sakila_lite %>% 
  tbl("film") %>% 
  filter(title %LIKE% "%victory%") %>% 
  select(title) %>% 
  show_query()
## <SQL>
## SELECT `title`
## FROM `film`
## WHERE (`title` LIKE '%victory%')

A slightly universal solution

SELECT "title" FROM "film" WHERE LOWER("title") LIKE '%victory%';
2 records
title
ALTER VICTORY
VICTORY ACADEMY
  • REGEX

Different servers use different operators to match regular expression. For MySQL, it is the REGEXP operator. For Postgresql, it is SIMILAR TO. For SQLite, it simiply doesn’t support regex.

Though, Postgresql’s implementation of regular expression is a bit different from the standard regex, see for example https://www.postgresql.org/docs/9.0/functions-matching.html

CASE

Similar to case_when() in dplyr.

SELECT 
  "film_id", 
  "title", 
  CASE 
    WHEN "length" < 60 THEN 'short'
    WHEN "length" < 90 THEN 'mid'
    ELSE 'long'
  END "length"
FROM "film";

JOIN operations

  • Inner Join - selects records that have matching values in both tables.
SELECT *
  FROM "inventory" a JOIN "rental" b
  ON a."inventory_id" = b."inventory_id";
Displaying records 1 - 10
inventory_id film_id store_id last_update rental_id rental_date inventory_id customer_id return_date staff_id last_update
367 80 1 2019-04-11 18:11:48 1 2005-05-24 22:53:30.000 367 130 2005-05-26 22:04:30.000 1 2019-04-11 18:11:49
1525 333 2 2019-04-11 18:11:48 2 2005-05-24 22:54:33.000 1525 459 2005-05-28 19:40:33.000 1 2019-04-11 18:11:49
1711 373 2 2019-04-11 18:11:48 3 2005-05-24 23:03:39.000 1711 408 2005-06-01 22:12:39.000 1 2019-04-11 18:11:49
2452 535 1 2019-04-11 18:11:48 4 2005-05-24 23:04:41.000 2452 333 2005-06-03 01:43:41.000 2 2019-04-11 18:11:49
2079 450 2 2019-04-11 18:11:48 5 2005-05-24 23:05:21.000 2079 222 2005-06-02 04:33:21.000 1 2019-04-11 18:11:49
2792 613 1 2019-04-11 18:11:48 6 2005-05-24 23:08:07.000 2792 549 2005-05-27 01:32:07.000 1 2019-04-11 18:11:49
3995 870 2 2019-04-11 18:11:48 7 2005-05-24 23:11:53.000 3995 269 2005-05-29 20:34:53.000 2 2019-04-11 18:11:49
2346 510 1 2019-04-11 18:11:48 8 2005-05-24 23:31:46.000 2346 239 2005-05-27 23:33:46.000 2 2019-04-11 18:11:49
2580 565 1 2019-04-11 18:11:48 9 2005-05-25 00:00:40.000 2580 126 2005-05-28 00:22:40.000 1 2019-04-11 18:11:49
1824 396 2 2019-04-11 18:11:48 10 2005-05-25 00:02:21.000 1824 399 2005-05-31 22:44:21.000 2 2019-04-11 18:11:49

If we only want store_id and customer_id,

SELECT a."store_id", b."customer_id"
  FROM "inventory" a JOIN "rental" b
  ON a."inventory_id" = b."inventory_id"
Displaying records 1 - 10
store_id customer_id
1 130
2 459
2 408
1 333
2 222
1 549
2 269
1 239
1 126
2 399
inner_join(
  tbl(sakila_lite, "inventory"), 
  tbl(sakila_lite, "rental"), 
  by = "inventory_id") %>%
  select(store_id, customer_id) %>% 
  collect()
## # A tibble: 16,044 x 2
##    store_id customer_id
##       <int>       <int>
##  1        1         130
##  2        2         459
##  3        2         408
##  4        1         333
##  5        2         222
##  6        1         549
##  7        2         269
##  8        1         239
##  9        1         126
## 10        2         399
## # … with 16,034 more rows

We could also join a more sophisticated subquery.

  • Find all the inventories that were rented but not returned.
SELECT a."store_id", a."inventory_id", b."customer_id"
  FROM "inventory" a 
  JOIN (
    SELECT * FROM "rental" WHERE "return_date" IS NULL
  ) b ON a."inventory_id" = b."inventory_id"
Displaying records 1 - 10
store_id inventory_id customer_id
1 2047 155
2 2026 335
2 1545 83
2 4106 219
1 817 99
2 1857 192
2 478 11
1 1622 597
2 3043 53
1 3947 521
  • Left Join - returns all records from the left table, and the matched records from the right table
SELECT a."store_id", a."inventory_id", b."customer_id"
  FROM "inventory" a 
  LEFT JOIN (
    SELECT * FROM "rental" WHERE "return_date" IS NULL
  ) b ON a."inventory_id" = b."inventory_id"
Displaying records 1 - 10
store_id inventory_id customer_id
1 1 NA
1 2 NA
1 3 NA
1 4 NA
2 5 NA
2 6 554
2 7 NA
2 8 NA
2 9 366
2 10 NA
  • Right Join - opposite of Left Join
SELECT a."store_id", a."inventory_id", b."customer_id"
  FROM "inventory" a 
  RIGHT JOIN (
    SELECT * FROM "rental" WHERE "return_date" IS NULL
  ) b ON a."inventory_id" = b."inventory_id"
Displaying records 1 - 10
store_id inventory_id customer_id
2 6 554
2 9 366
2 21 111
2 25 590
2 70 108
1 81 236
2 97 512
2 106 44
2 112 349
2 177 317

Remark: SQLite doesn’t support Right join

  • Full Join - returns all records when there is a match in left or right table records.
SELECT a."store_id", a."inventory_id", b."customer_id"
  FROM "inventory" a 
  FULL JOIN (
    SELECT * FROM "rental" WHERE "return_date" IS NULL
  ) b ON a."inventory_id" = b."inventory_id"
Displaying records 1 - 10
store_id inventory_id customer_id
1 1 NA
1 2 NA
1 3 NA
1 4 NA
2 5 NA
2 6 554
2 7 NA
2 8 NA
2 9 366
2 10 NA

Remark: SQLite doesn’t support full join.

  • Semi Join - return all records in the left table which has a match in the right table.
SELECT *
  FROM "inventory" a
  WHERE EXISTS (
    SELECT * FROM "rental" b
      WHERE "return_date" IS NULL AND b."inventory_id" = a."inventory_id"
  )
Displaying records 1 - 10
inventory_id film_id store_id last_update
6 1 2 2019-04-11 18:11:48
9 2 2 2019-04-11 18:11:48
21 4 2 2019-04-11 18:11:48
25 5 2 2019-04-11 18:11:48
70 13 2 2019-04-11 18:11:48
81 17 1 2019-04-11 18:11:48
97 19 2 2019-04-11 18:11:48
106 21 2 2019-04-11 18:11:48
112 22 2 2019-04-11 18:11:48
177 39 2 2019-04-11 18:11:48
tbl(sakila_lite, "inventory") %>% 
  semi_join(
    tbl(sakila_lite, "rental") %>% filter(is.na(return_date)),
    by = "inventory_id") %>% 
  show_query()
## <SQL>
## SELECT * FROM `inventory` AS `LHS`
## WHERE EXISTS (
##   SELECT 1 FROM (SELECT *
## FROM `rental`
## WHERE (((`return_date`) IS NULL))) AS `RHS`
##   WHERE (`LHS`.`inventory_id` = `RHS`.`inventory_id`)
## )
  • Anti Join - remove all records in the left table which has a match in the right table.
SELECT *
  FROM "inventory" a
  WHERE NOT EXISTS (
    SELECT * FROM "rental" b
      WHERE b."return_date" IS NULL AND a."inventory_id" = b."inventory_id"
  )
Displaying records 1 - 10
inventory_id film_id store_id last_update
1 1 1 2019-04-11 18:11:48
2 1 1 2019-04-11 18:11:48
3 1 1 2019-04-11 18:11:48
4 1 1 2019-04-11 18:11:48
5 1 2 2019-04-11 18:11:48
7 1 2 2019-04-11 18:11:48
8 1 2 2019-04-11 18:11:48
10 2 2 2019-04-11 18:11:48
11 2 2 2019-04-11 18:11:48
12 3 2 2019-04-11 18:11:48
tbl(sakila_lite, "inventory") %>% 
  anti_join(
    tbl(sakila_lite, "rental") %>% filter(is.na(return_date)),
    by = "inventory_id") %>% 
  collect()
## # A tibble: 4,398 x 4
##    inventory_id film_id store_id last_update        
##           <int>   <int>    <int> <chr>              
##  1            1       1        1 2019-04-11 18:11:48
##  2            2       1        1 2019-04-11 18:11:48
##  3            3       1        1 2019-04-11 18:11:48
##  4            4       1        1 2019-04-11 18:11:48
##  5            5       1        2 2019-04-11 18:11:48
##  6            7       1        2 2019-04-11 18:11:48
##  7            8       1        2 2019-04-11 18:11:48
##  8           10       2        2 2019-04-11 18:11:48
##  9           11       2        2 2019-04-11 18:11:48
## 10           12       3        2 2019-04-11 18:11:48
## # … with 4,388 more rows

Aggregate Functions

  • AVG – calculate the average value of a set.
  • COUNT – return the number of items in a set.
  • SUM – return the sum all or distinct items of a set.
  • MAX – find the maximum value in a set.
  • MIN – find the minimum value in a set.
SELECT AVG("length") as "avg_length" FROM "film"
1 records
avg_length
115.272
sakila_lite %>% 
  tbl("film") %>% 
  summarize(avg_length = mean(length)) %>% 
  collect()
## # A tibble: 1 x 1
##   avg_length
##        <dbl>
## 1       115.

Group By

SELECT "rating", AVG("length") AS "avg_length" 
  FROM "film" GROUP BY "rating";
5 records
rating avg_length
G 111.0506
NC-17 113.2286
PG 112.0052
PG-13 120.4439
R 118.6615
sakila_lite %>% 
  tbl("film") %>% 
  group_by(rating) %>% 
  summarize(avg_length = mean(length)) %>% 
  collect()
## # A tibble: 5 x 2
##   rating avg_length
##   <chr>       <dbl>
## 1 G            111.
## 2 NC-17        113.
## 3 PG           112.
## 4 PG-13        120.
## 5 R            119.
SELECT "rating", "rental_duration", AVG("length") as "avg_length"
  FROM "film" GROUP BY "rating", "rental_duration"
Displaying records 1 - 10
rating rental_duration avg_length
G 3 100.4286
G 4 102.2857
G 5 109.5758
G 6 128.0000
G 7 116.3448
NC-17 3 113.3243
NC-17 4 119.1515
NC-17 5 105.4186
NC-17 6 111.7895
NC-17 7 118.7000
sakila_lite %>% 
  tbl("film") %>% 
  group_by(rating, rental_duration) %>% 
  summarize(avg_length = mean(length)) %>% 
  collect()
## # A tibble: 25 x 3
## # Groups:   rating [5]
##    rating rental_duration avg_length
##    <chr>            <int>      <dbl>
##  1 G                    3       100.
##  2 G                    4       102.
##  3 G                    5       110.
##  4 G                    6       128 
##  5 G                    7       116.
##  6 NC-17                3       113.
##  7 NC-17                4       119.
##  8 NC-17                5       105.
##  9 NC-17                6       112.
## 10 NC-17                7       119.
## # … with 15 more rows

Having Statements

SELECT "rating", AVG("length") AS "avg_length" 
  FROM "film" 
  GROUP BY "rating" 
  HAVING "avg_length" > 115;
2 records
rating avg_length
PG-13 120.4439
R 118.6615

We could simulate the HAVING clause by a subquery.

SELECT *
FROM 
  ( 
  SELECT "rating", AVG("length") AS "avg_length" 
  FROM "film" 
  GROUP BY "rating" 
  ) 
WHERE "avg_length" > 115;
2 records
rating avg_length
PG-13 120.4439
R 118.6615

Remark: Postgres requires an alias name for the subquery.

Subquery and WITH

We could use subquery to perform more complicated tasks. Eg, get the rating with maximum average length.

-- this does not work
SELECT *
FROM 
  ( 
  SELECT "rating", AVG("length") AS "avg_length" 
  FROM "film" 
  GROUP BY "rating" 
  ) "Temp"
WHERE "avg_length" = MAX("avg_length");
SELECT "rating", "avg_length" 
FROM (
    SELECT "rating", AVG("length") AS "avg_length" 
      FROM "film" GROUP BY "rating"
  ) "Temp"
WHERE "avg_length" = (
  SELECT MAX("avg_length") FROM (
    SELECT "rating", AVG("length") AS "avg_length" 
    FROM "film" GROUP BY "rating"
  ) "Temp2"
)
1 records
rating avg_length
PG-13 120.4439

THe above repeats the subquery twice.

The WITH clause allows us to refer to th temp table in WHERE statement.

WITH "Temp" AS(
    SELECT "rating", AVG("length") AS "avg_length" 
      FROM "film" GROUP BY "rating"
  )
  SELECT "rating", "avg_length" 
    FROM "Temp"
    WHERE "avg_length" = (SELECT MAX("avg_length") FROM "Temp")
1 records
rating avg_length
PG-13 120.4439

OVER

Calculate the sum of “amount” for each store.

SELECT "store_id", "amount" 
 FROM "rental" r 
  JOIN "payment" p 
  on r."rental_id" = p."rental_id"
  JOIN "inventory" i 
  on r."inventory_id" = i."inventory_id"
Displaying records 1 - 10
store_id amount
2 1.99
1 0.99
1 6.99
2 0.99
2 4.99
1 2.99
2 1.99
1 4.99
1 8.99
1 0.99
SELECT "store_id", SUM("amount") 
 FROM "rental" r 
  JOIN "payment" p 
  on r."rental_id" = p."rental_id"
  JOIN "inventory" i 
  on r."inventory_id" = i."inventory_id"
  GROUP BY "store_id"
2 records
store_id sum
2 33726.77
1 33689.74

Instead of only showing the sum, we want to show show the amounts aside the sum.

SELECT "store_id", "amount", SUM("amount") OVER (PARTITION BY "store_id")
FROM
(SELECT * FROM "rental" r 
  JOIN "payment" p 
  on r."rental_id" = p."rental_id"
  JOIN "inventory" i 
  on r."inventory_id" = i."inventory_id") "table2"
Displaying records 1 - 10
store_id amount sum
1 0.99 33689.74
1 4.99 33689.74
1 8.99 33689.74
1 2.99 33689.74
1 1.99 33689.74
1 0.99 33689.74
1 8.99 33689.74
1 6.99 33689.74
1 4.99 33689.74
1 3.99 33689.74

Remark: the OVER clause could also be used to calculate moving average.

SET Operators

  • UNION and UNION ALL – combine result set of two or more queries into a single result set using the UNION and UNION ALL operators.
  • INTERSECT – return the intersection of two or more queries using the INTERSECT operator.
  • EXCEPT – subtract a result set from another result set using the EXCEPT operator
SELECT * FROM "film" where "film_id" <= 3
UNION
SELECT * FROM "film" where "film_id" <= 4;
4 records
film_id title description release_year language_id original_language_id rental_duration rental_rate length replacement_cost rating special_features last_update
1 ACADEMY DINOSAUR A Epic Drama of a Feminist And a Mad Scientist who must Battle a Teacher in The Canadian Rockies 2006 1 NA 6 0.99 86 20.99 PG Deleted Scenes,Behind the Scenes 2019-04-11 18:11:48
2 ACE GOLDFINGER A Astounding Epistle of a Database Administrator And a Explorer who must Find a Car in Ancient China 2006 1 NA 3 4.99 48 12.99 G Trailers,Deleted Scenes 2019-04-11 18:11:48
3 ADAPTATION HOLES A Astounding Reflection of a Lumberjack And a Car who must Sink a Lumberjack in A Baloon Factory 2006 1 NA 7 2.99 50 18.99 NC-17 Trailers,Deleted Scenes 2019-04-11 18:11:48
4 AFFAIR PREJUDICE A Fanciful Documentary of a Frisbee And a Lumberjack who must Chase a Monkey in A Shark Tank 2006 1 NA 5 2.99 117 26.99 G Commentaries,Behind the Scenes 2019-04-11 18:11:48

UNION ALL doesn’t remove repeated rows

SELECT * FROM "film" where "film_id" <= 3
UNION ALL
SELECT * FROM "film" where "film_id" <= 4;
7 records
film_id title description release_year language_id original_language_id rental_duration rental_rate length replacement_cost rating special_features last_update
1 ACADEMY DINOSAUR A Epic Drama of a Feminist And a Mad Scientist who must Battle a Teacher in The Canadian Rockies 2006 1 NA 6 0.99 86 20.99 PG Deleted Scenes,Behind the Scenes 2019-04-11 18:11:48
2 ACE GOLDFINGER A Astounding Epistle of a Database Administrator And a Explorer who must Find a Car in Ancient China 2006 1 NA 3 4.99 48 12.99 G Trailers,Deleted Scenes 2019-04-11 18:11:48
3 ADAPTATION HOLES A Astounding Reflection of a Lumberjack And a Car who must Sink a Lumberjack in A Baloon Factory 2006 1 NA 7 2.99 50 18.99 NC-17 Trailers,Deleted Scenes 2019-04-11 18:11:48
1 ACADEMY DINOSAUR A Epic Drama of a Feminist And a Mad Scientist who must Battle a Teacher in The Canadian Rockies 2006 1 NA 6 0.99 86 20.99 PG Deleted Scenes,Behind the Scenes 2019-04-11 18:11:48
2 ACE GOLDFINGER A Astounding Epistle of a Database Administrator And a Explorer who must Find a Car in Ancient China 2006 1 NA 3 4.99 48 12.99 G Trailers,Deleted Scenes 2019-04-11 18:11:48
3 ADAPTATION HOLES A Astounding Reflection of a Lumberjack And a Car who must Sink a Lumberjack in A Baloon Factory 2006 1 NA 7 2.99 50 18.99 NC-17 Trailers,Deleted Scenes 2019-04-11 18:11:48
4 AFFAIR PREJUDICE A Fanciful Documentary of a Frisbee And a Lumberjack who must Chase a Monkey in A Shark Tank 2006 1 NA 5 2.99 117 26.99 G Commentaries,Behind the Scenes 2019-04-11 18:11:48
SELECT * FROM "film" where "film_id" <= 5
INTERSECT
SELECT * FROM "film" where "film_id" >= 2;
4 records
film_id title description release_year language_id original_language_id rental_duration rental_rate length replacement_cost rating special_features last_update
2 ACE GOLDFINGER A Astounding Epistle of a Database Administrator And a Explorer who must Find a Car in Ancient China 2006 1 NA 3 4.99 48 12.99 G Trailers,Deleted Scenes 2019-04-11 18:11:48
3 ADAPTATION HOLES A Astounding Reflection of a Lumberjack And a Car who must Sink a Lumberjack in A Baloon Factory 2006 1 NA 7 2.99 50 18.99 NC-17 Trailers,Deleted Scenes 2019-04-11 18:11:48
4 AFFAIR PREJUDICE A Fanciful Documentary of a Frisbee And a Lumberjack who must Chase a Monkey in A Shark Tank 2006 1 NA 5 2.99 117 26.99 G Commentaries,Behind the Scenes 2019-04-11 18:11:48
5 AFRICAN EGG A Fast-Paced Documentary of a Pastry Chef And a Dentist who must Pursue a Forensic Psychologist in The Gulf of Mexico 2006 1 NA 6 2.99 130 22.99 G Deleted Scenes 2019-04-11 18:11:48
SELECT * FROM "film" where "film_id" <= 5
EXCEPT
SELECT * FROM "film" where "film_id" >= 4;
3 records
film_id title description release_year language_id original_language_id rental_duration rental_rate length replacement_cost rating special_features last_update
1 ACADEMY DINOSAUR A Epic Drama of a Feminist And a Mad Scientist who must Battle a Teacher in The Canadian Rockies 2006 1 NA 6 0.99 86 20.99 PG Deleted Scenes,Behind the Scenes 2019-04-11 18:11:48
2 ACE GOLDFINGER A Astounding Epistle of a Database Administrator And a Explorer who must Find a Car in Ancient China 2006 1 NA 3 4.99 48 12.99 G Trailers,Deleted Scenes 2019-04-11 18:11:48
3 ADAPTATION HOLES A Astounding Reflection of a Lumberjack And a Car who must Sink a Lumberjack in A Baloon Factory 2006 1 NA 7 2.99 50 18.99 NC-17 Trailers,Deleted Scenes 2019-04-11 18:11:48

Remark: anti-join allows different column names in the tables.

Database Manipulation

# create a local empty SQLite database called mydb.sqlite
mydb <- dbConnect(RSQLite::SQLite(), dbname = "mydb.sqlite")
  • create table
mydb %>% dbCreateTable(
  "table1",
  tibble(fruit = character(0), count = integer(0))
)
mydb %>% dbReadTable("table1")
CREATE TABLE "table2" (
    "id" int NOT NULL,
    "last_name" varchar(255) NOT NULL,
    "first_name" varchar(255),
    "age" int,
    PRIMARY KEY ("id")
);

There is also dbWriteTable which exports the whole data frame as a table of the database.

  • delete a table permanently.
DROP TABLE "table2";
mydb %>% dbExecute('DROP TABLE "table2";')
drop_table <- function(db, tname) {
  drop_sql <- sqlInterpolate(db, "DROP TABLE ?tablename;", tablename = tname)
  db %>% dbExecute(drop_sql)
}

mydb %>% drop_table("table1")

Remark: the use of sqlInterpolate is to avoid SQL injection attack. Computerphile has a great video on this topic: https://www.youtube.com/watch?v=ciNHn38EyRc

  • INSERT – insert one or more rows into a table.
mydb %>% dbAppendTable(
  "table1",
  tibble(fruit = "apple", count = 2))

# alternatively
insert_sql <- mydb %>% sqlAppendTable(
  "table1",
  tibble(fruit = "apple", count = 2), row.names = FALSE)
mydb %>% dbExecute(insert_sql)

mydb %>% dbReadTable("table1")
INSERT INTO 'table2' (id, last_name, first_name, age)
  VALUES (1, "Lai", "Randy", 16);  
INSERT INTO 'table2' (id, last_name, first_name)
  VALUES (2, "Lai", "Natalie");
mydb %>% dbReadTable("table2")
  • UPDATE – update existing data in a table.
mydb %>% dbExecute('UPDATE "table2" SET "age" = 33 WHERE "id" = 1;')
UPDATE "table2" SET "age" = 33 WHERE "id" = 1;
mydb %>% dbReadTable("table2")
  • DELETE – delete data from a table permanently.
mydb %>% dbExecute('DELETE FROM "table2" WHERE "id" = 1;')
DELETE FROM "table2" WHERE "id" = 1;
mydb %>% dbReadTable("table2")

Reference