代码
library(tidyverse)
library(gt)
library(gtsummary)library(tidyverse)
library(gt)
library(gtsummary)gtsummary包的tbl_summary()函数可以生成统计表,但对于检出限较高,表格中四分位数低于检出限的情况,tbl_summary()函数无法直接处理。这里提供一种解决方法,可以把低于检出限的四分位数用“<LOD”表示。
rm(list = ls())
# 创建一个虚拟的数据集,表示50个样本3个物质的检出浓度,并把这50个样本分为病例和对照两组
set.seed(123)
data <- tibble(
group = factor(sample(c("case", "control"), 50, replace = TRUE)),
A = rnorm(50, 0.5, 0.1),
B = rnorm(50, 0.6, 0.1),
C = rnorm(50, 0.7, 0.1)
)
# 模拟检出限
lod <- tibble(
name = c("A", "B", "C"),
LOD = c(0.5, 0.56, 0.7)
)先用tbl_summary()函数生成四分位数统计表,并在分组的基础上添加总体统计信息和p值。
summary_table <- data |>
tbl_summary(
by = group,
statistic = list(all_continuous() ~ "{median}[{p25}, {p75}]",
all_categorical() ~ "{n} / {N} ({p}%)"),
digits = list(all_continuous() ~ 3)
) |>
add_overall() |>
add_p()summary_table| Characteristic | Overall N = 501 |
case N = 301 |
control N = 201 |
p-value2 |
|---|---|---|---|---|
| A | 0.501[0.451, 0.578] | 0.520[0.431, 0.588] | 0.496[0.466, 0.544] | 0.6 |
| B | 0.590[0.540, 0.652] | 0.575[0.529, 0.630] | 0.622[0.562, 0.653] | 0.14 |
| C | 0.695[0.626, 0.769] | 0.686[0.599, 0.769] | 0.700[0.655, 0.769] | 0.6 |
| 1 Median[Q1, Q3] | ||||
| 2 Wilcoxon rank sum exact test | ||||
gt(lod)| name | LOD |
|---|---|
| A | 0.50 |
| B | 0.56 |
| C | 0.70 |
统计表中,有多个物质的四分位数低于检出限,且没有检出限列和检出率列。之后可以用summarise函数对data手动处理,制作一个用于数据替换和补充的数据框。
# 手搓一个用“<LOD”替换四分位数的数据框----
summary_lod_1 <- data |>
pivot_longer(cols = -group, names_to = "name", values_to = "value") |>
group_by(group, name) |>
summarise(
Q1 = quantile(value, 0.25),
Q2 = quantile(value, 0.5),
Q3 = quantile(value, 0.75)
) |>
mutate(across(Q1:Q3, ~round(., 3))) |>
left_join(lod, by = "name") |>
mutate(across(Q1:Q3, ~ case_when(
. < LOD ~ "<LOD",
TRUE ~ as.character(.)
))) |>
mutate(ci = str_glue("{Q2}[{Q1}, {Q3}]")) |>
select(group, name, ci) |>
pivot_wider(names_from = group, values_from = ci)
summary_lod_1
# 汇总列也需要替换,上述代码不对group分组即可----
summary_lod_2 <- data |>
pivot_longer(cols = -group, names_to = "name", values_to = "value") |>
group_by(name) |>
summarise(
Q1 = quantile(value, 0.25),
Q2 = quantile(value, 0.5),
Q3 = quantile(value, 0.75)
) |>
mutate(across(Q1:Q3, ~round(., 3))) |>
left_join(lod, by = "name") |>
mutate(across(Q1:Q3, ~ case_when(
. < LOD ~ "<LOD",
TRUE ~ as.character(.)
))) |>
mutate(ci = str_glue("{Q2}[{Q1}, {Q3}]")) |>
select(name, ci)
summary_lod_2
# 检出限和检出率列(分组)----
summary_3 <- data |>
pivot_longer(cols = -group, names_to = "name", values_to = "value") |>
group_by(group, name) |>
summarise(
LOD = first(lod$LOD[match(name, lod$name)]),
detect_rate = mean(value > LOD)
) |>
pivot_wider(names_from = group, values_from = detect_rate) |>
rename(det_rate_case = case, det_rate_control = control)
summary_3
# 检出率列(总体)----
summary_4 <- data |>
pivot_longer(cols = -group, names_to = "name", values_to = "value") |>
group_by(name) |>
summarise(
LOD = first(lod$LOD[match(name, lod$name)]),
detect_rate = mean(value > LOD)
) |>
rename(det_rate = detect_rate)
summary_4case_when换成if_else会报错,因为替换结果是字符型,和数值型的原始值不匹配
str_glue()函数把四分位数拼接成字符串,其中{}表示引用变量
summary_table表格一样的形式
match()函数找到对应的检出限
summary_table表格一样的形式
# A tibble: 3 × 3
name case control
<chr> <glue> <glue>
1 A 0.52[<LOD, 0.587] <LOD[<LOD, 0.543]
2 B 0.575[<LOD, 0.629] 0.622[0.563, 0.653]
3 C <LOD[<LOD, 0.765] 0.7[<LOD, 0.766]
# A tibble: 3 × 2
name ci
<chr> <glue>
1 A 0.501[<LOD, 0.576]
2 B 0.59[<LOD, 0.65]
3 C <LOD[<LOD, 0.768]
# A tibble: 3 × 4
name LOD det_rate_case det_rate_control
<chr> <dbl> <dbl> <dbl>
1 A 0.5 0.533 0.45
2 B 0.56 0.633 0.8
3 C 0.7 0.467 0.5
# A tibble: 3 × 3
name LOD det_rate
<chr> <dbl> <dbl>
1 A 0.5 0.5
2 B 0.56 0.7
3 C 0.7 0.48
在gtsummary的表格对象中,有一个table_body属性,可以用$符号提取,这就是表格的主体部分。可以直接对其进行修改,也可以在modify_table_body()函数中修改,其好处是可以用tidyverse的基础函数直接处理。 以下是用modify_table_body()函数修改表格的方法。 同时也用了modify_column_unhide和modify_header()函数修改表头,主要负责修改列名和把新添加的列显示出来。
# 查看表格主体,主要看需要调整的列及其列名与位置关系
summary_table$table_body
# 数据替换
summary_table <- summary_table |>
modify_table_body(\(x) {
x |>
mutate(
stat_0 = summary_lod_2$ci,
stat_1 = summary_lod_1$case,
stat_2 = summary_lod_1$control
) |>
mutate(
stat_3 = summary_3$det_rate_case,
stat_4 = summary_3$det_rate_control,
stat_5 = summary_4$det_rate,
stat_6 = summary_4$LOD
)
}) |>
modify_column_unhide(
columns = c(stat_3, stat_4, stat_5)
) |>
modify_header(
label = '**Variable**',
stat_0 = '**Total**',
stat_3 = '**Detection Rate (Case)**',
stat_4 = '**Detection Rate (Control)**',
stat_5 = '**Detection Rate (Total)**',
stat_6 = '**LOD**'
) |>
# _$table_body
modify_table_body(\(x) {
x |>
relocate(stat_3, .after = stat_1) |>
relocate(stat_4, .after = stat_2) |>
relocate(stat_5, .after = stat_0) |>
relocate(stat_6, .after = label)
})\(x)是function(x)的简写,x是表格的主体部分
modify_column_unhide()函数用于显示隐藏的列,这里显示上一步添加的列
label在modify_header()函数中表示表头的第一列列名
stat_x的对应关系
relocate()函数调整列的位置,.after参数表示在哪一列后面插入,这里把检出率插入到对应95%置信区间的后面
# A tibble: 3 × 14
variable test_name var_type row_type var_label label stat_0 stat_1 stat_2
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 A wilcox.test continuous label A A 0.501[… 0.520… 0.496…
2 B wilcox.test continuous label B B 0.590[… 0.575… 0.622…
3 C wilcox.test continuous label C C 0.695[… 0.686… 0.700…
# ℹ 5 more variables: estimate <dbl>, statistic <dbl>, conf.low <dbl>,
# conf.high <dbl>, p.value <dbl>
summary_table| Variable | LOD | Total1 | Detection Rate (Total) | case N = 301 |
Detection Rate (Case) | control N = 201 |
Detection Rate (Control) | p-value2 |
|---|---|---|---|---|---|---|---|---|
| A | 0.50 | 0.501[<LOD, 0.576] | 0.50 | 0.52[<LOD, 0.587] | 0.5333333 | <LOD[<LOD, 0.543] | 0.45 | 0.6 |
| B | 0.56 | 0.59[<LOD, 0.65] | 0.70 | 0.575[<LOD, 0.629] | 0.6333333 | 0.622[0.563, 0.653] | 0.80 | 0.14 |
| C | 0.70 | <LOD[<LOD, 0.768] | 0.48 | <LOD[<LOD, 0.765] | 0.4666667 | 0.7[<LOD, 0.766] | 0.50 | 0.6 |
| 1 Median[Q1, Q3] | ||||||||
| 2 Wilcoxon rank sum exact test | ||||||||
可以看到,还有一些细节问题可以优化,除了用gtsummary的函数,也可以在使用as_gt()函数之后,用gt包的函数进行修改。
summary_table |>
as_gt() |>
fmt_number(columns = everything(), decimals = 3) -> summary_tableas_gt()函数把gtsummary的表格对象转换成gt的表格对象
fmt_number()函数格式化数字,保留三位小数
summary_table| Variable | LOD | Total1 | Detection Rate (Total) | case N = 301 |
Detection Rate (Case) | control N = 201 |
Detection Rate (Control) | p-value2 |
|---|---|---|---|---|---|---|---|---|
| A | 0.500 | 0.501[<LOD, 0.576] | 0.500 | 0.52[<LOD, 0.587] | 0.533 | <LOD[<LOD, 0.543] | 0.450 | 0.631 |
| B | 0.560 | 0.59[<LOD, 0.65] | 0.700 | 0.575[<LOD, 0.629] | 0.633 | 0.622[0.563, 0.653] | 0.800 | 0.141 |
| C | 0.700 | <LOD[<LOD, 0.768] | 0.480 | <LOD[<LOD, 0.765] | 0.467 | 0.7[<LOD, 0.766] | 0.500 | 0.617 |
| 1 Median[Q1, Q3] | ||||||||
| 2 Wilcoxon rank sum exact test | ||||||||