library(tidyverse)
library(DBI)

What is a database? It is what google says

a structured set of data held in a computer, especially one that is accessible in various ways.

A relational database is a type of database that stores and provides access to data points that are related to one another. Relation databases are administrated by a Relational Database Management System (RDBMS). The data in RDBMS is stored in database objects called tables. A table is a collection of related data entries and it consists of columns and rows.

There are many RDBMS - MySQL (owned by Oracle) - MariaDB (open source version MySQL) - PostgreSQL (open source, ANSI) - SQL Server (microsoft) - SQLite (open source, single file)

What is SQL? Structured Query Language (or SQL) is a standard language for accessing and manipulating relational databases. However, each RDMBS may have their own extension of the SQL language and their implmentation may vary too.

Connect to a database

We are going to use a popular database called Sakila https://www.jooq.org/sakila.

The Sakila database is a nicely normalised schema modelling a DVD rental store, featuring things like films, actors, film-actor relationships, and a central inventory table that connects films, stores, and rentals.

In the following, we are going to use sqlite, postgresql and MySql.

SQLite

The database is called sakila.sqlite. You could either git clone from lectures repo or download using the code

if (!file.exists("sakila.sqlite") || file.size("sakila.sqlite") == 0) {
  download.file(
    "https://github.com/ucdavis-sta141b-2021-winter/sta141b-lectures/raw/master/ch02-sql/sakila.sqlite", 
    destfile = "sakila.sqlite")
}

The file format is .sqlite which is one of the very common relational database formats, especially for simple problems.

sakila_lite <- dbConnect(RSQLite::SQLite(), dbname = "sakila.sqlite")
sakila_lite %>% dbListTables()

##  [1] "actor"                  "address"                "category"              
##  [4] "city"                   "country"                "customer"              
##  [7] "customer_list"          "film"                   "film_actor"            
## [10] "film_category"          "film_list"              "film_text"             
## [13] "inventory"              "language"               "payment"               
## [16] "rental"                 "sales_by_film_category" "sales_by_store"        
## [19] "sqlite_sequence"        "staff"                  "staff_list"            
## [22] "store"

sakila_lite %>% dbListFields("actor")  # column names

## [1] "actor_id"    "first_name"  "last_name"   "last_update"

Postgresql

I have also uploaded the Sakila database to a postgres server owned by the department. (You’ll need to either on the campus or over UCD vpn to connect to it)

sakila_psql <- dbConnect(RPostgres::Postgres(),
  dbname = "sakila",
  user = "psqluser", password = "secret", host = "alan.ucdavis.edu"
)
sakila_psql %>% dbListTables()

##  [1] "inventory"                  "staff"                     
##  [3] "payment_p2017_05"           "payment_p2017_06"          
##  [5] "payment"                    "actor"                     
##  [7] "store"                      "payment_p2017_02"          
##  [9] "rental"                     "language"                  
## [11] "payment_p2017_03"           "address"                   
## [13] "payment_p2017_01"           "film"                      
## [15] "country"                    "city"                      
## [17] "payment_p2017_04"           "film_actor"                
## [19] "category"                   "film_category"             
## [21] "customer"                   "actor_info"                
## [23] "customer_list"              "film_list"                 
## [25] "nicer_but_slower_film_list" "sales_by_film_category"    
## [27] "sales_by_store"             "staff_list"

MySQL

MySQL (MariaDB) is another popular RDBMS.

sakila_mysql <- dbConnect(RMariaDB::MariaDB(),
  dbname = "DATA",
  user = "student", password = "141b@2020stats", host = "alan.ucdavis.edu"
)
sakila_mysql %>% dbListTables()

##  [1] "actor"                      "address"                   
##  [3] "category"                   "city"                      
##  [5] "country"                    "customer"                  
##  [7] "customer_list"              "film"                      
##  [9] "film_actor"                 "film_category"             
## [11] "film_list"                  "film_text"                 
## [13] "inventory"                  "language"                  
## [15] "nicer_but_slower_film_list" "payment"                   
## [17] "rental"                     "sales_by_film_category"    
## [19] "sales_by_store"             "staff"                     
## [21] "staff_list"                 "store"

How not to use SQL?

dplyr provides an excellent interface for users without any SQL background to query databases.

film <- sakila_lite %>%
  tbl("film")

# to download all observations
film2 <- film %>% 
  collect()

film relies on the connection

# dbDisconnect(sakila_lite)

sakila_lite %>% tbl("film") creates a virtual table rather loading the whole table into memory.

class(film)

## [1] "tbl_SQLiteConnection" "tbl_dbi"              "tbl_sql"             
## [4] "tbl_lazy"             "tbl"

class(film2)

## [1] "tbl_df"     "tbl"        "data.frame"

sakila_lite %>% dbListFields("film")

##  [1] "film_id"              "title"                "description"         
##  [4] "release_year"         "language_id"          "original_language_id"
##  [7] "rental_duration"      "rental_rate"          "length"              
## [10] "replacement_cost"     "rating"               "special_features"    
## [13] "last_update"

film90 <- film %>%
  filter(length > 90) %>% 
  collect()
class(film90)

## [1] "tbl_df"     "tbl"        "data.frame"

film90

## # A tibble: 675 x 13
##    film_id title description release_year language_id original_langua…
##      <int> <chr> <chr>       <chr>              <int>            <int>
##  1       4 AFFA… A Fanciful… 2006                   1               NA
##  2       5 AFRI… A Fast-Pac… 2006                   1               NA
##  3       6 AGEN… A Intrepid… 2006                   1               NA
##  4       9 ALAB… A Thoughtf… 2006                   1               NA
##  5      11 ALAM… A Boring E… 2006                   1               NA
##  6      12 ALAS… A Fanciful… 2006                   1               NA
##  7      13 ALI … A Action-P… 2006                   1               NA
##  8      14 ALIC… A Emotiona… 2006                   1               NA
##  9      16 ALLE… A Fast-Pac… 2006                   1               NA
## 10      19 AMAD… A Emotiona… 2006                   1               NA
## # … with 665 more rows, and 7 more variables: rental_duration <int>,
## #   rental_rate <dbl>, length <int>, replacement_cost <dbl>, rating <chr>,
## #   special_features <chr>, last_update <chr>

Sakila queries

https://datamastery.gitlab.io/exercises/sakila-queries.html

Which actors have the first name Scarlett?

scarlett <- sakila_lite %>%
  tbl("actor") %>%
  filter(str_to_lower(first_name) == str_to_lower("Scarlett")) %>%
  collect()

Suppose we want to make the result a bit more beautiful.

scarlett %>% 
  mutate(
    first_name = str_to_title(first_name), 
    last_name = str_to_title(last_name))

## # A tibble: 2 x 4
##   actor_id first_name last_name last_update        
##      <int> <chr>      <chr>     <chr>              
## 1       81 Scarlett   Damon     2019-04-11 18:11:48
## 2      124 Scarlett   Bening    2019-04-11 18:11:48

Note: mutation is done locally in R.

It is possible to do mutation on the remote server (Postgresql sevrer)

sakila_psql %>%
  tbl("actor") %>%
  filter(str_to_lower(first_name) == str_to_lower("Scarlett")) %>%
  mutate(
    first_name = str_to_title(first_name), 
    last_name = str_to_title(last_name)) %>% 
  collect()

## # A tibble: 2 x 4
##   actor_id first_name last_name last_update        
##      <int> <chr>      <chr>     <dttm>             
## 1       81 Scarlett   Damon     2017-02-15 09:34:33
## 2      124 Scarlett   Bening    2017-02-15 09:34:33

Note: SQLite and Mysql don’t support transforming title case but Postgresql does.

Which actors have the last name Johansson?

sakila_lite %>%
  tbl("actor") %>%
  filter(str_to_lower(last_name) == "johansson") %>%
  collect()

## # A tibble: 3 x 4
##   actor_id first_name last_name last_update        
##      <int> <chr>      <chr>     <chr>              
## 1        8 MATTHEW    JOHANSSON 2019-04-11 18:11:48
## 2       64 RAY        JOHANSSON 2019-04-11 18:11:48
## 3      146 ALBERT     JOHANSSON 2019-04-11 18:11:48

How many distinct actors last names are there?

sakila_lite %>%
  tbl("actor") %>%
  summarize(n = n_distinct(last_name)) %>% 
  collect()

## # A tibble: 1 x 1
##       n
##   <int>
## 1   121

Which last names are not repeated?

sakila_lite %>%
  tbl("actor") %>%
  count(last_name) %>%
  filter(n == 1) %>%
  collect()

## # A tibble: 66 x 2
##    last_name     n
##    <chr>     <int>
##  1 ASTAIRE       1
##  2 BACALL        1
##  3 BALE          1
##  4 BALL          1
##  5 BARRYMORE     1
##  6 BASINGER      1
##  7 BERGEN        1
##  8 BERGMAN       1
##  9 BIRCH         1
## 10 BLOOM         1
## # … with 56 more rows

Which last names appear more than once?

sakila_lite %>%
  tbl("actor") %>%
  count(last_name) %>%
  filter(n > 1) %>%
  collect()

## # A tibble: 55 x 2
##    last_name     n
##    <chr>     <int>
##  1 AKROYD        3
##  2 ALLEN         3
##  3 BAILEY        2
##  4 BENING        2
##  5 BERRY         3
##  6 BOLGER        2
##  7 BRODY         2
##  8 CAGE          2
##  9 CHASE         2
## 10 CRAWFORD      2
## # … with 45 more rows

Which actor has appeared in the most films?

actor <- sakila_lite %>% tbl("actor")
sakila_lite %>%
  tbl("film_actor") %>%
  count(actor_id) %>%
  arrange(desc(n)) %>%
  head(1) %>%
  inner_join(actor, by = "actor_id") %>%
  collect()

## # A tibble: 1 x 5
##   actor_id     n first_name last_name last_update        
##      <int> <int> <chr>      <chr>     <chr>              
## 1      107    42 GINA       DEGENERES 2019-04-11 18:11:48

What is that average running time of all the films in the sakila DB?

sakila_lite %>%
  tbl("film") %>%
  summarize(m = mean(length)) %>%
  collect()

## Warning: Missing values are always removed in SQL.
## Use `mean(x, na.rm = TRUE)` to silence this warning
## This warning is displayed only once per session.

## # A tibble: 1 x 1
##       m
##   <dbl>
## 1  115.

What is the average running time of films by category?

sakila_lite %>%
  tbl("film") %>%
  left_join(tbl(sakila_lite, "film_category"), by = "film_id") %>%
  group_by(category_id) %>%
  summarize(mean_length = mean(length)) %>%
  left_join(tbl(sakila_lite, "category"), by = "category_id") %>%
  select(name, mean_length) %>%
  collect()

## # A tibble: 16 x 2
##    name        mean_length
##    <chr>             <dbl>
##  1 Action             112.
##  2 Animation          111.
##  3 Children           110.
##  4 Classics           112.
##  5 Comedy             116.
##  6 Documentary        109.
##  7 Drama              121.
##  8 Family             115.
##  9 Foreign            122.
## 10 Games              128.
## 11 Horror             112.
## 12 Music              114.
## 13 New                111.
## 14 Sci-Fi             108.
## 15 Sports             128.
## 16 Travel             113.

Is ‘Unforgiven Zoolander’ available for rent from Store 1?

uz <- sakila_lite %>%
  tbl("film") %>%
  filter(str_to_lower(title) == str_to_lower("Unforgiven Zoolander")) %>%
  select(film_id)
all_inventories_of_store1 <- sakila_lite %>%
  tbl("inventory") %>%
  filter(store_id == 1) %>%
  select(film_id, inventory_id)
not_yet_returned <- sakila_lite %>%
  tbl("rental") %>%
  filter(is.na(return_date)) %>%
  select(inventory_id)
uz %>%
  inner_join(all_inventories_of_store1, by = "film_id") %>%
  anti_join(not_yet_returned, by = "inventory_id") %>%
  count() %>%
  collect()

## # A tibble: 1 x 1
##       n
##   <int>
## 1     2

SQL

We just see some example queries of a relational database. Behind the scene, we are using a language called SQL. For example, in the last query, the SQL used is

uz %>%
  inner_join(all_inventories_of_store1) %>%
  anti_join(not_yet_returned) %>%
  count() %>%
  show_query()

## Joining, by = "film_id"

## Joining, by = "inventory_id"

## <SQL>
## SELECT COUNT() AS `n`
## FROM (SELECT * FROM (SELECT `LHS`.`film_id` AS `film_id`, `RHS`.`inventory_id` AS `inventory_id`
## FROM (SELECT `film_id`
## FROM `film`
## WHERE (LOWER(`title`) = LOWER('Unforgiven Zoolander'))) AS `LHS`
## INNER JOIN (SELECT `film_id`, `inventory_id`
## FROM `inventory`
## WHERE (`store_id` = 1.0)) AS `RHS`
## ON (`LHS`.`film_id` = `RHS`.`film_id`)
## ) AS `LHS`
## WHERE NOT EXISTS (
##   SELECT 1 FROM (SELECT `inventory_id`
## FROM `rental`
## WHERE (((`return_date`) IS NULL))) AS `RHS`
##   WHERE (`LHS`.`inventory_id` = `RHS`.`inventory_id`)
## ))

Why learning SQL when there is dplyr?

SQL is everywhere (used in python, php, etc..)
dplyr magics only read, do not write
Job interviews

In R, a SQL query can be made by using dbGetQuery which counts number of rows in rental.

sakila_lite %>%
  dbGetQuery("SELECT COUNT(*) AS `n` FROM `rental`")

##       n
## 1 16044

We could also make SQL query by sql block. In here, we are using the connection sakila_lite. The result will be printed directly.

SELECT COUNT(*) AS `n` FROM `rental`

1 records
n
16044

In we need the output, set output.var to rental_count

SELECT COUNT(*) AS `n` FROM `rental`;

The output could be later used in R blocks

rental_count

##       n
## 1 16044

For comparison, in Python, we use

import sqlite3
sakila = sqlite3.connect('sakila.sqlite')
c = sakila.cursor()
c.execute("SELECT COUNT(*) AS `n` FROM `rental`;")
c.fetchall()

Identifier quotation

SQLite supports both double quotes (which is the standard) and backticks to quote identifiers (table and column names). Backticks are used in another popular database MySQL. Double quotes are used in Postgresql. It is always a good practice to quote the identifiers.

-- MySQL
SELECT COUNT(*) AS `n` from `actor`;

1 records
n
200

-- PostgresQL (ANSI standard)
SELECT COUNT(*) AS "n" from "actor";

1 records
n
200

SQLite supports both styles.

SELECT COUNT(*) AS `n` from `actor`;

1 records
n
200

SELECT

The SELECT statement is pretty much the select() function in dplyr.

SELECT * FROM "rental";

Displaying records 1 - 10
rental_id	rental_date	inventory_id	customer_id	return_date	staff_id	last_update
1	2005-05-24 22:53:30.000	367	130	2005-05-26 22:04:30.000	1	2019-04-11 18:11:49
2	2005-05-24 22:54:33.000	1525	459	2005-05-28 19:40:33.000	1	2019-04-11 18:11:49
3	2005-05-24 23:03:39.000	1711	408	2005-06-01 22:12:39.000	1	2019-04-11 18:11:49
4	2005-05-24 23:04:41.000	2452	333	2005-06-03 01:43:41.000	2	2019-04-11 18:11:49
5	2005-05-24 23:05:21.000	2079	222	2005-06-02 04:33:21.000	1	2019-04-11 18:11:49
6	2005-05-24 23:08:07.000	2792	549	2005-05-27 01:32:07.000	1	2019-04-11 18:11:49
7	2005-05-24 23:11:53.000	3995	269	2005-05-29 20:34:53.000	2	2019-04-11 18:11:49
8	2005-05-24 23:31:46.000	2346	239	2005-05-27 23:33:46.000	2	2019-04-11 18:11:49
9	2005-05-25 00:00:40.000	2580	126	2005-05-28 00:22:40.000	1	2019-04-11 18:11:49
10	2005-05-25 00:02:21.000	1824	399	2005-05-31 22:44:21.000	2	2019-04-11 18:11:49

SELECT LOWER("last_name") AS "family_name" FROM "actor";

Displaying records 1 - 10
family_name
akroyd
akroyd
akroyd
allen
allen
allen
astaire
bacall
bailey
bailey

For comparison,

sakila_lite %>%
  tbl("actor") %>%
  transmute(family_name = str_to_lower(last_name)) %>%
  collect()

## # A tibble: 200 x 1
##    family_name
##    <chr>      
##  1 akroyd     
##  2 akroyd     
##  3 akroyd     
##  4 allen      
##  5 allen      
##  6 allen      
##  7 astaire    
##  8 bacall     
##  9 bailey     
## 10 bailey     
## # … with 190 more rows

sakila_lite %>%
  tbl("actor") %>%
  transmute(family_name = str_to_lower(last_name)) %>%
  show_query()

## <SQL>
## SELECT LOWER(`last_name`) AS `family_name`
## FROM `actor`

To select every column or some columns,

SELECT * FROM "actor";

Displaying records 1 - 10
actor_id	first_name	last_name	last_update
1	PENELOPE	GUINESS	2019-04-11 18:11:48
2	NICK	WAHLBERG	2019-04-11 18:11:48
3	ED	CHASE	2019-04-11 18:11:48
4	JENNIFER	DAVIS	2019-04-11 18:11:48
5	JOHNNY	LOLLOBRIGIDA	2019-04-11 18:11:48
6	BETTE	NICHOLSON	2019-04-11 18:11:48
7	GRACE	MOSTEL	2019-04-11 18:11:48
8	MATTHEW	JOHANSSON	2019-04-11 18:11:48
9	JOE	SWANK	2019-04-11 18:11:48
10	CHRISTIAN	GABLE	2019-04-11 18:11:48

SELECT "rental_id", "last_update" AS "last_update_time" FROM "rental";

Displaying records 1 - 10
rental_id	last_update_time
1	2019-04-11 18:11:49
2	2019-04-11 18:11:49
3	2019-04-11 18:11:49
4	2019-04-11 18:11:49
5	2019-04-11 18:11:49
6	2019-04-11 18:11:49
7	2019-04-11 18:11:49
8	2019-04-11 18:11:49
9	2019-04-11 18:11:49
10	2019-04-11 18:11:49

ORDER BY Clause

It is equivalent to arrange() in dplyr

SELECT "film_id", "title" FROM "film" ORDER BY "title";

Displaying records 1 - 10
film_id	title
1	ACADEMY DINOSAUR
2	ACE GOLDFINGER
3	ADAPTATION HOLES
4	AFFAIR PREJUDICE
5	AFRICAN EGG
6	AGENT TRUMAN
7	AIRPLANE SIERRA
8	AIRPORT POLLOCK
9	ALABAMA DEVIL
10	ALADDIN CALENDAR

SELECT "film_id", "title" FROM "film" ORDER BY "title" DESC;

Displaying records 1 - 10
film_id	title
1000	ZORRO ARK
999	ZOOLANDER FICTION
998	ZHIVAGO CORE
997	YOUTH KICK
996	YOUNG LANGUAGE
995	YENTL IDAHO
994	WYOMING STORM
993	WRONG BEHAVIOR
992	WRATH MILE
991	WORST BANGER

DISTINCT

DISTINCT operator to remove duplicates from a result set. It is equivalent to distinct() function in dplyr.

SELECT DISTINCT "last_name" FROM "actor";

Displaying records 1 - 10
last_name
AKROYD
ALLEN
ASTAIRE
BACALL
BAILEY
BALE
BALL
BARRYMORE
BASINGER
BENING

sakila_lite %>%
  tbl("actor") %>%
  distinct(last_name) %>% 
  show_query()

## <SQL>
## SELECT DISTINCT `last_name`
## FROM `actor`

LIMIT

SELECT * FROM "actor" LIMIT 2;

2 records
actor_id	first_name	last_name	last_update
1	PENELOPE	GUINESS	2019-04-11 18:11:48
2	NICK	WAHLBERG	2019-04-11 18:11:48

sakila_lite %>%
  tbl("actor") %>%
  head(2) %>% 
  show_query()

## <SQL>
## SELECT *
## FROM `actor`
## LIMIT 2

WHERE

It is equivalent to filter() in dplyr.

SELECT * FROM "film" WHERE `rating` = "PG" AND "length" = 90;

1 records
film_id	title	description	release_year	language_id	original_language_id	rental_duration	rental_rate	length	replacement_cost	rating	special_features	last_update
776	SECRET GROUNDHOG	A Astounding Story of a Cat And a Database Administrator who must Build a Technical Writer in New Orleans	2006	1	NA	6	4.99	90	11.99	PG	Commentaries,Deleted Scenes	2019-04-11 18:11:48

SELECT * FROM "film" WHERE "rating" = 'PG' AND "length" = 90;

1 records
film_id	title	description	release_year	language_id	original_language_id	rental_duration	rental_rate	length	replacement_cost	rating	last_update	special_features	fulltext
776	SECRET GROUNDHOG	A Astounding Story of a Cat And a Database Administrator who must Build a Technical Writer in New Orleans	2006	1	NA	6	4.99	90	11.99	PG	2017-09-10 17:46:03	{Commentaries,“Deleted Scenes”}	‘administr’:12 ‘astound’:4 ‘build’:15 ‘cat’:8 ‘databas’:11 ‘groundhog’:2 ‘must’:14 ‘new’:20 ‘orlean’:21 ‘secret’:1 ‘stori’:5 ‘technic’:17 ‘writer’:18

SELECT * FROM `film` WHERE `rating` = "PG" OR `length` = 90;

Displaying records 1 - 10
film_id	title	description	release_year	language_id	original_language_id	rental_duration	rental_rate	length	replacement_cost	rating	special_features	last_update
1	ACADEMY DINOSAUR	An Epic Drama of a Feminist And a Mad Scientist who must Battle a Teacher in The Canadian Rockies	2006	1	NA	6	0.99	86	20.99	PG	Deleted Scenes,Behind the Scenes	2006-02-15 13:03:42
6	AGENT TRUMAN	A Intrepid Panorama of a Robot And a Boy who must Escape a Sumo Wrestler in Ancient China	2006	1	NA	3	2.99	169	17.99	PG	Deleted Scenes	2006-02-15 13:03:42
12	ALASKA PHANTOM	A Fanciful Saga of a Hunter And a Pastry Chef who must Vanquish a Boy in Australia	2006	1	NA	6	0.99	136	22.99	PG	Commentaries,Deleted Scenes	2006-02-15 13:03:42
13	ALI FOREVER	A Action-Packed Drama of a Dentist And a Crocodile who must Battle a Feminist in The Canadian Rockies	2006	1	NA	4	4.99	150	21.99	PG	Deleted Scenes,Behind the Scenes	2006-02-15 13:03:42
19	AMADEUS HOLY	A Emotional Display of a Pioneer And a Technical Writer who must Battle a Man in A Baloon	2006	1	NA	6	0.99	113	20.99	PG	Commentaries,Deleted Scenes,Behind the Scenes	2006-02-15 13:03:42
37	ARIZONA BANG	A Brilliant Panorama of a Mad Scientist And a Mad Cow who must Meet a Pioneer in A Monastery	2006	1	NA	3	2.99	121	28.99	PG	Trailers,Deleted Scenes	2006-02-15 13:03:42
41	ARSENIC INDEPENDENCE	A Fanciful Documentary of a Mad Cow And a Womanizer who must Find a Dentist in Berlin	2006	1	NA	4	0.99	137	17.99	PG	Trailers,Deleted Scenes,Behind the Scenes	2006-02-15 13:03:42
57	BASIC EASY	A Stunning Epistle of a Man And a Husband who must Reach a Mad Scientist in A Jet Boat	2006	1	NA	4	2.99	90	18.99	PG-13	Deleted Scenes	2006-02-15 13:03:42
63	BEDAZZLED MARRIED	A Astounding Character Study of a Madman And a Robot who must Meet a Mad Scientist in An Abandoned Fun House	2006	1	NA	6	0.99	73	21.99	PG	Trailers,Deleted Scenes,Behind the Scenes	2006-02-15 13:03:42
65	BEHAVIOR RUNAWAY	A Unbelieveable Drama of a Student And a Husband who must Outrace a Sumo Wrestler in Berlin	2006	1	NA	3	4.99	100	20.99	PG	Trailers,Deleted Scenes,Behind the Scenes	2006-02-15 13:03:42

Remarks:

For strings, SQLite (and MySQL) allows double qoutes to quote string values but it is actually not the SQL standard. In SQL standard, strings are quoted in single quotes.
In SQL standard, we should use = for comparison, but not ==.
There exists other keywords such as NOT and OR.

dplyr equivalent

sakila_lite %>%
  tbl("film") %>%
  filter(rating == "PG", length == 90)

## # Source:   lazy query [?? x 13]
## # Database: sqlite 3.30.1
## #   [/Users/Randy/Dropbox/Winter2021/sta141b/sta141b-lectures/ch02-sql/sakila.sqlite]
##   film_id title description release_year language_id original_langua…
##     <int> <chr> <chr>       <chr>              <int>            <int>
## 1     776 SECR… A Astoundi… 2006                   1               NA
## # … with 7 more variables: rental_duration <int>, rental_rate <dbl>,
## #   length <int>, replacement_cost <dbl>, rating <chr>, special_features <chr>,
## #   last_update <chr>

The IN operator

SELECT * FROM "film" WHERE "rating" IN ('PG', 'PG-13');

Displaying records 1 - 10
film_id	title	description	release_year	language_id	original_language_id	rental_duration	rental_rate	length	replacement_cost	rating	special_features	last_update
1	ACADEMY DINOSAUR	A Epic Drama of a Feminist And a Mad Scientist who must Battle a Teacher in The Canadian Rockies	2006	1	NA	6	0.99	86	20.99	PG	Deleted Scenes,Behind the Scenes	2019-04-11 18:11:48
6	AGENT TRUMAN	A Intrepid Panorama of a Robot And a Boy who must Escape a Sumo Wrestler in Ancient China	2006	1	NA	3	2.99	169	17.99	PG	Deleted Scenes	2019-04-11 18:11:48
7	AIRPLANE SIERRA	A Touching Saga of a Hunter And a Butler who must Discover a Butler in A Jet Boat	2006	1	NA	6	4.99	62	28.99	PG-13	Trailers,Deleted Scenes	2019-04-11 18:11:48
9	ALABAMA DEVIL	A Thoughtful Panorama of a Database Administrator And a Mad Scientist who must Outgun a Mad Scientist in A Jet Boat	2006	1	NA	3	2.99	114	21.99	PG-13	Trailers,Deleted Scenes	2019-04-11 18:11:48
12	ALASKA PHANTOM	A Fanciful Saga of a Hunter And a Pastry Chef who must Vanquish a Boy in Australia	2006	1	NA	6	0.99	136	22.99	PG	Commentaries,Deleted Scenes	2019-04-11 18:11:48
13	ALI FOREVER	A Action-Packed Drama of a Dentist And a Crocodile who must Battle a Feminist in The Canadian Rockies	2006	1	NA	4	4.99	150	21.99	PG	Deleted Scenes,Behind the Scenes	2019-04-11 18:11:48
18	ALTER VICTORY	A Thoughtful Drama of a Composer And a Feminist who must Meet a Secret Agent in The Canadian Rockies	2006	1	NA	6	0.99	57	27.99	PG-13	Trailers,Behind the Scenes	2019-04-11 18:11:48
19	AMADEUS HOLY	A Emotional Display of a Pioneer And a Technical Writer who must Battle a Man in A Baloon	2006	1	NA	6	0.99	113	20.99	PG	Commentaries,Deleted Scenes,Behind the Scenes	2019-04-11 18:11:48
28	ANTHEM LUKE	A Touching Panorama of a Waitress And a Woman who must Outrace a Dog in An Abandoned Amusement Park	2006	1	NA	5	4.99	91	16.99	PG-13	Deleted Scenes,Behind the Scenes	2019-04-11 18:11:48
33	APOLLO TEEN	A Action-Packed Reflection of a Crocodile And a Explorer who must Find a Sumo Wrestler in An Abandoned Mine Shaft	2006	1	NA	5	2.99	153	15.99	PG-13	Trailers,Commentaries,Deleted Scenes,Behind the Scenes	2019-04-11 18:11:48

sakila_lite %>%
  tbl("film") %>%
  filter(rating %in% c("PG", "PG-13")) %>% 
  collect()

## # A tibble: 417 x 13
##    film_id title description release_year language_id original_langua…
##      <int> <chr> <chr>       <chr>              <int>            <int>
##  1       1 ACAD… A Epic Dra… 2006                   1               NA
##  2       6 AGEN… A Intrepid… 2006                   1               NA
##  3       7 AIRP… A Touching… 2006                   1               NA
##  4       9 ALAB… A Thoughtf… 2006                   1               NA
##  5      12 ALAS… A Fanciful… 2006                   1               NA
##  6      13 ALI … A Action-P… 2006                   1               NA
##  7      18 ALTE… A Thoughtf… 2006                   1               NA
##  8      19 AMAD… A Emotiona… 2006                   1               NA
##  9      28 ANTH… A Touching… 2006                   1               NA
## 10      33 APOL… A Action-P… 2006                   1               NA
## # … with 407 more rows, and 7 more variables: rental_duration <int>,
## #   rental_rate <dbl>, length <int>, replacement_cost <dbl>, rating <chr>,
## #   special_features <chr>, last_update <chr>

The LIKE operator

See https://www.w3schools.com/sql/sql_like.asp

SELECT "title" FROM "film" WHERE "title" LIKE '%victory%';

2 records
title
ALTER VICTORY
VICTORY ACADEMY

Remark: in SQLite, the LIKE operator is case insensitive. However, it is not the case for other DBs.

SELECT "title" FROM "film" WHERE "title" LIKE '%victory%';

0 records
title

In Postgres, there is a ILIKE (case insensitive LIKE) operator

SELECT "title" FROM "film" WHERE "title" ILIKE '%victory%';

2 records
title
ALTER VICTORY
VICTORY ACADEMY

In MySQL (depends on the configuration),

SELECT `title` FROM `film` WHERE `title` LIKE '%victory%';

2 records
title
ALTER VICTORY
VICTORY ACADEMY

dplyr equivalent

sakila_lite %>% 
  tbl("film") %>% 
  filter(title %LIKE% "%victory%") %>% 
  select(title) %>% 
  show_query()

## <SQL>
## SELECT `title`
## FROM `film`
## WHERE (`title` LIKE '%victory%')

A slightly universal solution

SELECT "title" FROM "film" WHERE LOWER("title") LIKE '%victory%';

2 records
title
ALTER VICTORY
VICTORY ACADEMY

REGEX

Different servers use different operators to match regular expression. For MySQL, it is the REGEXP operator. For Postgresql, it is SIMILAR TO. For SQLite, it simiply doesn’t support regex.

Though, Postgresql’s implementation of regular expression is a bit different from the standard regex, see for example https://www.postgresql.org/docs/9.0/functions-matching.html

CASE

Similar to case_when() in dplyr.

SELECT 
  "film_id", 
  "title", 
  CASE 
    WHEN "length" < 60 THEN 'short'
    WHEN "length" < 90 THEN 'mid'
    ELSE 'long'
  END "length"
FROM "film";

JOIN operations

Inner Join - selects records that have matching values in both tables.

SELECT *
  FROM "inventory" a JOIN "rental" b
  ON a."inventory_id" = b."inventory_id";

Displaying records 1 - 10
inventory_id	film_id	store_id	last_update	rental_id	rental_date	inventory_id	customer_id	return_date	staff_id	last_update
367	80	1	2019-04-11 18:11:48	1	2005-05-24 22:53:30.000	367	130	2005-05-26 22:04:30.000	1	2019-04-11 18:11:49
1525	333	2	2019-04-11 18:11:48	2	2005-05-24 22:54:33.000	1525	459	2005-05-28 19:40:33.000	1	2019-04-11 18:11:49
1711	373	2	2019-04-11 18:11:48	3	2005-05-24 23:03:39.000	1711	408	2005-06-01 22:12:39.000	1	2019-04-11 18:11:49
2452	535	1	2019-04-11 18:11:48	4	2005-05-24 23:04:41.000	2452	333	2005-06-03 01:43:41.000	2	2019-04-11 18:11:49
2079	450	2	2019-04-11 18:11:48	5	2005-05-24 23:05:21.000	2079	222	2005-06-02 04:33:21.000	1	2019-04-11 18:11:49
2792	613	1	2019-04-11 18:11:48	6	2005-05-24 23:08:07.000	2792	549	2005-05-27 01:32:07.000	1	2019-04-11 18:11:49
3995	870	2	2019-04-11 18:11:48	7	2005-05-24 23:11:53.000	3995	269	2005-05-29 20:34:53.000	2	2019-04-11 18:11:49
2346	510	1	2019-04-11 18:11:48	8	2005-05-24 23:31:46.000	2346	239	2005-05-27 23:33:46.000	2	2019-04-11 18:11:49
2580	565	1	2019-04-11 18:11:48	9	2005-05-25 00:00:40.000	2580	126	2005-05-28 00:22:40.000	1	2019-04-11 18:11:49
1824	396	2	2019-04-11 18:11:48	10	2005-05-25 00:02:21.000	1824	399	2005-05-31 22:44:21.000	2	2019-04-11 18:11:49

If we only want store_id and customer_id,

SELECT a."store_id", b."customer_id"
  FROM "inventory" a JOIN "rental" b
  ON a."inventory_id" = b."inventory_id"

Displaying records 1 - 10
store_id	customer_id
1	130
2	459
2	408
1	333
2	222
1	549
2	269
1	239
1	126
2	399

inner_join(
  tbl(sakila_lite, "inventory"), 
  tbl(sakila_lite, "rental"), 
  by = "inventory_id") %>%
  select(store_id, customer_id) %>% 
  collect()

## # A tibble: 16,044 x 2
##    store_id customer_id
##       <int>       <int>
##  1        1         130
##  2        2         459
##  3        2         408
##  4        1         333
##  5        2         222
##  6        1         549
##  7        2         269
##  8        1         239
##  9        1         126
## 10        2         399
## # … with 16,034 more rows

We could also join a more sophisticated subquery.

Find all the inventories that were rented but not returned.

SELECT a."store_id", a."inventory_id", b."customer_id"
  FROM "inventory" a 
  JOIN (
    SELECT * FROM "rental" WHERE "return_date" IS NULL
  ) b ON a."inventory_id" = b."inventory_id"

Displaying records 1 - 10
store_id	inventory_id	customer_id
1	2047	155
2	2026	335
2	1545	83
2	4106	219
1	817	99
2	1857	192
2	478	11
1	1622	597
2	3043	53
1	3947	521

Left Join - returns all records from the left table, and the matched records from the right table

SELECT a."store_id", a."inventory_id", b."customer_id"
  FROM "inventory" a 
  LEFT JOIN (
    SELECT * FROM "rental" WHERE "return_date" IS NULL
  ) b ON a."inventory_id" = b."inventory_id"

Displaying records 1 - 10
store_id	inventory_id	customer_id
1	1	NA
1	2	NA
1	3	NA
1	4	NA
2	5	NA
2	6	554
2	7	NA
2	8	NA
2	9	366
2	10	NA

Right Join - opposite of Left Join

SELECT a."store_id", a."inventory_id", b."customer_id"
  FROM "inventory" a 
  RIGHT JOIN (
    SELECT * FROM "rental" WHERE "return_date" IS NULL
  ) b ON a."inventory_id" = b."inventory_id"

Displaying records 1 - 10
store_id	inventory_id	customer_id
2	6	554
2	9	366
2	21	111
2	25	590
2	70	108
1	81	236
2	97	512
2	106	44
2	112	349
2	177	317

Remark: SQLite doesn’t support Right join

Full Join - returns all records when there is a match in left or right table records.

SELECT a."store_id", a."inventory_id", b."customer_id"
  FROM "inventory" a 
  FULL JOIN (
    SELECT * FROM "rental" WHERE "return_date" IS NULL
  ) b ON a."inventory_id" = b."inventory_id"

Displaying records 1 - 10
store_id	inventory_id	customer_id
1	1	NA
1	2	NA
1	3	NA
1	4	NA
2	5	NA
2	6	554
2	7	NA
2	8	NA
2	9	366
2	10	NA

Remark: SQLite doesn’t support full join.

Semi Join - return all records in the left table which has a match in the right table.

SELECT *
  FROM "inventory" a
  WHERE EXISTS (
    SELECT * FROM "rental" b
      WHERE "return_date" IS NULL AND b."inventory_id" = a."inventory_id"
  )

Displaying records 1 - 10
inventory_id	film_id	store_id	last_update
6	1	2	2019-04-11 18:11:48
9	2	2	2019-04-11 18:11:48
21	4	2	2019-04-11 18:11:48
25	5	2	2019-04-11 18:11:48
70	13	2	2019-04-11 18:11:48
81	17	1	2019-04-11 18:11:48
97	19	2	2019-04-11 18:11:48
106	21	2	2019-04-11 18:11:48
112	22	2	2019-04-11 18:11:48
177	39	2	2019-04-11 18:11:48

tbl(sakila_lite, "inventory") %>% 
  semi_join(
    tbl(sakila_lite, "rental") %>% filter(is.na(return_date)),
    by = "inventory_id") %>% 
  show_query()

## <SQL>
## SELECT * FROM `inventory` AS `LHS`
## WHERE EXISTS (
##   SELECT 1 FROM (SELECT *
## FROM `rental`
## WHERE (((`return_date`) IS NULL))) AS `RHS`
##   WHERE (`LHS`.`inventory_id` = `RHS`.`inventory_id`)
## )

Anti Join - remove all records in the left table which has a match in the right table.

SELECT *
  FROM "inventory" a
  WHERE NOT EXISTS (
    SELECT * FROM "rental" b
      WHERE b."return_date" IS NULL AND a."inventory_id" = b."inventory_id"
  )

Displaying records 1 - 10
inventory_id	film_id	store_id	last_update
1	1	1	2019-04-11 18:11:48
2	1	1	2019-04-11 18:11:48
3	1	1	2019-04-11 18:11:48
4	1	1	2019-04-11 18:11:48
5	1	2	2019-04-11 18:11:48
7	1	2	2019-04-11 18:11:48
8	1	2	2019-04-11 18:11:48
10	2	2	2019-04-11 18:11:48
11	2	2	2019-04-11 18:11:48
12	3	2	2019-04-11 18:11:48

tbl(sakila_lite, "inventory") %>% 
  anti_join(
    tbl(sakila_lite, "rental") %>% filter(is.na(return_date)),
    by = "inventory_id") %>% 
  collect()

## # A tibble: 4,398 x 4
##    inventory_id film_id store_id last_update        
##           <int>   <int>    <int> <chr>              
##  1            1       1        1 2019-04-11 18:11:48
##  2            2       1        1 2019-04-11 18:11:48
##  3            3       1        1 2019-04-11 18:11:48
##  4            4       1        1 2019-04-11 18:11:48
##  5            5       1        2 2019-04-11 18:11:48
##  6            7       1        2 2019-04-11 18:11:48
##  7            8       1        2 2019-04-11 18:11:48
##  8           10       2        2 2019-04-11 18:11:48
##  9           11       2        2 2019-04-11 18:11:48
## 10           12       3        2 2019-04-11 18:11:48
## # … with 4,388 more rows

Aggregate Functions

AVG – calculate the average value of a set.
COUNT – return the number of items in a set.
SUM – return the sum all or distinct items of a set.
MAX – find the maximum value in a set.
MIN – find the minimum value in a set.

SELECT AVG("length") as "avg_length" FROM "film"

1 records
avg_length
115.272

sakila_lite %>% 
  tbl("film") %>% 
  summarize(avg_length = mean(length)) %>% 
  collect()

## # A tibble: 1 x 1
##   avg_length
##        <dbl>
## 1       115.

Group By

SELECT "rating", AVG("length") AS "avg_length" 
  FROM "film" GROUP BY "rating";

5 records
rating	avg_length
G	111.0506
NC-17	113.2286
PG	112.0052
PG-13	120.4439
R	118.6615

sakila_lite %>% 
  tbl("film") %>% 
  group_by(rating) %>% 
  summarize(avg_length = mean(length)) %>% 
  collect()

## # A tibble: 5 x 2
##   rating avg_length
##   <chr>       <dbl>
## 1 G            111.
## 2 NC-17        113.
## 3 PG           112.
## 4 PG-13        120.
## 5 R            119.

SELECT "rating", "rental_duration", AVG("length") as "avg_length"
  FROM "film" GROUP BY "rating", "rental_duration"

Displaying records 1 - 10
rating	rental_duration	avg_length
G	3	100.4286
G	4	102.2857
G	5	109.5758
G	6	128.0000
G	7	116.3448
NC-17	3	113.3243
NC-17	4	119.1515
NC-17	5	105.4186
NC-17	6	111.7895
NC-17	7	118.7000

sakila_lite %>% 
  tbl("film") %>% 
  group_by(rating, rental_duration) %>% 
  summarize(avg_length = mean(length)) %>% 
  collect()

## # A tibble: 25 x 3
## # Groups:   rating [5]
##    rating rental_duration avg_length
##    <chr>            <int>      <dbl>
##  1 G                    3       100.
##  2 G                    4       102.
##  3 G                    5       110.
##  4 G                    6       128 
##  5 G                    7       116.
##  6 NC-17                3       113.
##  7 NC-17                4       119.
##  8 NC-17                5       105.
##  9 NC-17                6       112.
## 10 NC-17                7       119.
## # … with 15 more rows

Having Statements

SELECT "rating", AVG("length") AS "avg_length" 
  FROM "film" 
  GROUP BY "rating" 
  HAVING "avg_length" > 115;

2 records
rating	avg_length
PG-13	120.4439
R	118.6615

We could simulate the HAVING clause by a subquery.

SELECT *
FROM 
  ( 
  SELECT "rating", AVG("length") AS "avg_length" 
  FROM "film" 
  GROUP BY "rating" 
  ) 
WHERE "avg_length" > 115;

2 records
rating	avg_length
PG-13	120.4439
R	118.6615

Remark: Postgres requires an alias name for the subquery.

Subquery and WITH

We could use subquery to perform more complicated tasks. Eg, get the rating with maximum average length.

-- this does not work
SELECT *
FROM 
  ( 
  SELECT "rating", AVG("length") AS "avg_length" 
  FROM "film" 
  GROUP BY "rating" 
  ) "Temp"
WHERE "avg_length" = MAX("avg_length");

SELECT "rating", "avg_length" 
FROM (
    SELECT "rating", AVG("length") AS "avg_length" 
      FROM "film" GROUP BY "rating"
  ) "Temp"
WHERE "avg_length" = (
  SELECT MAX("avg_length") FROM (
    SELECT "rating", AVG("length") AS "avg_length" 
    FROM "film" GROUP BY "rating"
  ) "Temp2"
)

1 records
rating	avg_length
PG-13	120.4439

THe above repeats the subquery twice.

The WITH clause allows us to refer to th temp table in WHERE statement.

WITH "Temp" AS(
    SELECT "rating", AVG("length") AS "avg_length" 
      FROM "film" GROUP BY "rating"
  )
  SELECT "rating", "avg_length" 
    FROM "Temp"
    WHERE "avg_length" = (SELECT MAX("avg_length") FROM "Temp")

1 records
rating	avg_length
PG-13	120.4439

OVER

Calculate the sum of “amount” for each store.

SELECT "store_id", "amount" 
 FROM "rental" r 
  JOIN "payment" p 
  on r."rental_id" = p."rental_id"
  JOIN "inventory" i 
  on r."inventory_id" = i."inventory_id"

Displaying records 1 - 10
store_id	amount
2	1.99
1	0.99
1	6.99
2	0.99
2	4.99
1	2.99
2	1.99
1	4.99
1	8.99
1	0.99

SELECT "store_id", SUM("amount") 
 FROM "rental" r 
  JOIN "payment" p 
  on r."rental_id" = p."rental_id"
  JOIN "inventory" i 
  on r."inventory_id" = i."inventory_id"
  GROUP BY "store_id"

2 records
store_id	sum
2	33726.77
1	33689.74

Instead of only showing the sum, we want to show show the amounts aside the sum.

SELECT "store_id", "amount", SUM("amount") OVER (PARTITION BY "store_id")
FROM
(SELECT * FROM "rental" r 
  JOIN "payment" p 
  on r."rental_id" = p."rental_id"
  JOIN "inventory" i 
  on r."inventory_id" = i."inventory_id") "table2"

Displaying records 1 - 10
store_id	amount	sum
1	0.99	33689.74
1	4.99	33689.74
1	8.99	33689.74
1	2.99	33689.74
1	1.99	33689.74
1	0.99	33689.74
1	8.99	33689.74
1	6.99	33689.74
1	4.99	33689.74
1	3.99	33689.74

Remark: the OVER clause could also be used to calculate moving average.

SET Operators

UNION and UNION ALL – combine result set of two or more queries into a single result set using the UNION and UNION ALL operators.
INTERSECT – return the intersection of two or more queries using the INTERSECT operator.
EXCEPT – subtract a result set from another result set using the EXCEPT operator

SELECT * FROM "film" where "film_id" <= 3
UNION
SELECT * FROM "film" where "film_id" <= 4;

4 records
film_id	title	description	release_year	language_id	original_language_id	rental_duration	rental_rate	length	replacement_cost	rating	special_features	last_update
1	ACADEMY DINOSAUR	A Epic Drama of a Feminist And a Mad Scientist who must Battle a Teacher in The Canadian Rockies	2006	1	NA	6	0.99	86	20.99	PG	Deleted Scenes,Behind the Scenes	2019-04-11 18:11:48
2	ACE GOLDFINGER	A Astounding Epistle of a Database Administrator And a Explorer who must Find a Car in Ancient China	2006	1	NA	3	4.99	48	12.99	G	Trailers,Deleted Scenes	2019-04-11 18:11:48
3	ADAPTATION HOLES	A Astounding Reflection of a Lumberjack And a Car who must Sink a Lumberjack in A Baloon Factory	2006	1	NA	7	2.99	50	18.99	NC-17	Trailers,Deleted Scenes	2019-04-11 18:11:48
4	AFFAIR PREJUDICE	A Fanciful Documentary of a Frisbee And a Lumberjack who must Chase a Monkey in A Shark Tank	2006	1	NA	5	2.99	117	26.99	G	Commentaries,Behind the Scenes	2019-04-11 18:11:48

UNION ALL doesn’t remove repeated rows

SELECT * FROM "film" where "film_id" <= 3
UNION ALL
SELECT * FROM "film" where "film_id" <= 4;

7 records
film_id	title	description	release_year	language_id	original_language_id	rental_duration	rental_rate	length	replacement_cost	rating	special_features	last_update
1	ACADEMY DINOSAUR	A Epic Drama of a Feminist And a Mad Scientist who must Battle a Teacher in The Canadian Rockies	2006	1	NA	6	0.99	86	20.99	PG	Deleted Scenes,Behind the Scenes	2019-04-11 18:11:48
2	ACE GOLDFINGER	A Astounding Epistle of a Database Administrator And a Explorer who must Find a Car in Ancient China	2006	1	NA	3	4.99	48	12.99	G	Trailers,Deleted Scenes	2019-04-11 18:11:48
3	ADAPTATION HOLES	A Astounding Reflection of a Lumberjack And a Car who must Sink a Lumberjack in A Baloon Factory	2006	1	NA	7	2.99	50	18.99	NC-17	Trailers,Deleted Scenes	2019-04-11 18:11:48
1	ACADEMY DINOSAUR	A Epic Drama of a Feminist And a Mad Scientist who must Battle a Teacher in The Canadian Rockies	2006	1	NA	6	0.99	86	20.99	PG	Deleted Scenes,Behind the Scenes	2019-04-11 18:11:48
2	ACE GOLDFINGER	A Astounding Epistle of a Database Administrator And a Explorer who must Find a Car in Ancient China	2006	1	NA	3	4.99	48	12.99	G	Trailers,Deleted Scenes	2019-04-11 18:11:48
3	ADAPTATION HOLES	A Astounding Reflection of a Lumberjack And a Car who must Sink a Lumberjack in A Baloon Factory	2006	1	NA	7	2.99	50	18.99	NC-17	Trailers,Deleted Scenes	2019-04-11 18:11:48
4	AFFAIR PREJUDICE	A Fanciful Documentary of a Frisbee And a Lumberjack who must Chase a Monkey in A Shark Tank	2006	1	NA	5	2.99	117	26.99	G	Commentaries,Behind the Scenes	2019-04-11 18:11:48

SELECT * FROM "film" where "film_id" <= 5
INTERSECT
SELECT * FROM "film" where "film_id" >= 2;

4 records
film_id	title	description	release_year	language_id	original_language_id	rental_duration	rental_rate	length	replacement_cost	rating	special_features	last_update
2	ACE GOLDFINGER	A Astounding Epistle of a Database Administrator And a Explorer who must Find a Car in Ancient China	2006	1	NA	3	4.99	48	12.99	G	Trailers,Deleted Scenes	2019-04-11 18:11:48
3	ADAPTATION HOLES	A Astounding Reflection of a Lumberjack And a Car who must Sink a Lumberjack in A Baloon Factory	2006	1	NA	7	2.99	50	18.99	NC-17	Trailers,Deleted Scenes	2019-04-11 18:11:48
4	AFFAIR PREJUDICE	A Fanciful Documentary of a Frisbee And a Lumberjack who must Chase a Monkey in A Shark Tank	2006	1	NA	5	2.99	117	26.99	G	Commentaries,Behind the Scenes	2019-04-11 18:11:48
5	AFRICAN EGG	A Fast-Paced Documentary of a Pastry Chef And a Dentist who must Pursue a Forensic Psychologist in The Gulf of Mexico	2006	1	NA	6	2.99	130	22.99	G	Deleted Scenes	2019-04-11 18:11:48

SELECT * FROM "film" where "film_id" <= 5
EXCEPT
SELECT * FROM "film" where "film_id" >= 4;

3 records
film_id	title	description	release_year	language_id	original_language_id	rental_duration	rental_rate	length	replacement_cost	rating	special_features	last_update
1	ACADEMY DINOSAUR	A Epic Drama of a Feminist And a Mad Scientist who must Battle a Teacher in The Canadian Rockies	2006	1	NA	6	0.99	86	20.99	PG	Deleted Scenes,Behind the Scenes	2019-04-11 18:11:48
2	ACE GOLDFINGER	A Astounding Epistle of a Database Administrator And a Explorer who must Find a Car in Ancient China	2006	1	NA	3	4.99	48	12.99	G	Trailers,Deleted Scenes	2019-04-11 18:11:48
3	ADAPTATION HOLES	A Astounding Reflection of a Lumberjack And a Car who must Sink a Lumberjack in A Baloon Factory	2006	1	NA	7	2.99	50	18.99	NC-17	Trailers,Deleted Scenes	2019-04-11 18:11:48

Remark: anti-join allows different column names in the tables.

Database Manipulation

# create a local empty SQLite database called mydb.sqlite
mydb <- dbConnect(RSQLite::SQLite(), dbname = "mydb.sqlite")

create table

mydb %>% dbCreateTable(
  "table1",
  tibble(fruit = character(0), count = integer(0))
)
mydb %>% dbReadTable("table1")

CREATE TABLE "table2" (
    "id" int NOT NULL,
    "last_name" varchar(255) NOT NULL,
    "first_name" varchar(255),
    "age" int,
    PRIMARY KEY ("id")
);

There is also dbWriteTable which exports the whole data frame as a table of the database.

delete a table permanently.

DROP TABLE "table2";

mydb %>% dbExecute('DROP TABLE "table2";')

drop_table <- function(db, tname) {
  drop_sql <- sqlInterpolate(db, "DROP TABLE ?tablename;", tablename = tname)
  db %>% dbExecute(drop_sql)
}

mydb %>% drop_table("table1")

Remark: the use of sqlInterpolate is to avoid SQL injection attack. Computerphile has a great video on this topic: https://www.youtube.com/watch?v=ciNHn38EyRc

INSERT – insert one or more rows into a table.

mydb %>% dbAppendTable(
  "table1",
  tibble(fruit = "apple", count = 2))

# alternatively
insert_sql <- mydb %>% sqlAppendTable(
  "table1",
  tibble(fruit = "apple", count = 2), row.names = FALSE)
mydb %>% dbExecute(insert_sql)

mydb %>% dbReadTable("table1")

INSERT INTO 'table2' (id, last_name, first_name, age)
  VALUES (1, "Lai", "Randy", 16);

INSERT INTO 'table2' (id, last_name, first_name)
  VALUES (2, "Lai", "Natalie");

mydb %>% dbReadTable("table2")

UPDATE – update existing data in a table.

mydb %>% dbExecute('UPDATE "table2" SET "age" = 33 WHERE "id" = 1;')

UPDATE "table2" SET "age" = 33 WHERE "id" = 1;

mydb %>% dbReadTable("table2")

DELETE – delete data from a table permanently.

mydb %>% dbExecute('DELETE FROM "table2" WHERE "id" = 1;')

DELETE FROM "table2" WHERE "id" = 1;

mydb %>% dbReadTable("table2")

Reference

SQL Tutorial https://www.sqltutorial.org/

store_id	inventory_id	customer_id
2	6	554
2	9	366
2	21	111
2	25	590
2	70	108
1	81	236
2	97	512
2	106	44
2	112	349
2	177	317

store_id	inventory_id	customer_id
2	6	554
2	9	366
2	21	111
2	25	590
2	70	108
1	81	236
2	97	512
2	106	44
2	112	349
2	177	317

Database and SQL