library(readr)
library(dplyr)
# Read the tab-delimited length table; its first row supplies the column names.
df <- read_delim("all_length.tsv", delim = "\t", col_names = TRUE)

# Keep only rows whose `type` is not "INV". The filtered table is stored as
# `df_filtered` and then also becomes the working `df` used further down.
df_filtered <- filter(df, type != "INV")
df <- df_filtered

# Preview the first rows of the filtered data.
head(df)
library(ggplot2)
# Count rows per type / group / length bin, using 50-wide bins over (0, 500].
# cut() builds right-closed intervals, so a length outside (0, 500] gets an NA
# bin and is tallied in a separate NA row.
# count() stands in for group_by() + summarise(n()) + ungroup(): it yields the
# same ungrouped table without summarise()'s implicit-regrouping message.
df_summary <- df %>%
  mutate(binned_length = cut(length, breaks = seq(0, 500, by = 50))) %>%
  count(type, group, binned_length, name = "count")

# Preview the binned counts, then save them as CSV.
head(df_summary)
write.csv(df_summary, file = "df_summary.csv", row.names = FALSE)
# Rebuild `df_summary` with coarser bins: 100-wide steps up to 1000, then
# (1000, 5000], (5000, 10000] and (10000, 100000]. Lengths outside
# (0, 100000] fall into an NA bin.
# count() stands in for group_by() + summarise(n()) + ungroup(): it yields the
# same ungrouped table without summarise()'s implicit-regrouping message.
df_summary <- df %>%
  mutate(
    binned_length = cut(
      length,
      breaks = c(seq(0, 1000, by = 100), 5000, 10000, 100000)
    )
  ) %>%
  count(type, group, binned_length, name = "count")
# Line chart of the binned counts: one line per type/group combination, with
# colour encoding the type and line style encoding the group.
# NOTE(review): ggplot2 >= 3.4.0 deprecates `size` for line width in favour of
# `linewidth`; kept as-is so older ggplot2 versions behave the same.
p <- ggplot(
  df_summary,
  aes(
    x = binned_length,
    y = count,
    color = type,
    linetype = group,
    group = interaction(type, group)
  )
) +
  geom_line(size = 1) +
  scale_color_manual(
    values = c("INS" = "blue", "DEL" = "red", "DUP" = "green")
  ) +
  scale_linetype_manual(values = c("Xian" = "solid", "Geng" = "dashed")) +
  labs(
    title = "Length Distribution by Type and Group",
    x = "Length Interval",
    y = "Count",
    color = "Type",
    linetype = "Group"
  ) +
  theme_minimal()

# Display the plot.
p

# Save the plot as a PNG (width 15, height 8, 300 dpi).
ggsave("length_distribution.png", plot = p, width = 15, height = 8, dpi = 300)
# Mean and standard deviation of `length` (each rounded to a whole number) for
# every type / group / length-bin combination. Bins are 100-wide up to 1000,
# then (1000, 5000], (5000, 10000] and (10000, 100000].
# sd() returns NA for a combination that contains only one row.
# `.groups = "drop"` returns an ungrouped table directly, replacing the
# trailing ungroup() and silencing summarise()'s implicit-regrouping message.
df_summary_avg <- df %>%
  mutate(
    binned_length = cut(
      length,
      breaks = c(seq(0, 1000, by = 100), 5000, 10000, 100000)
    )
  ) %>%
  group_by(type, group, binned_length) %>%
  summarise(
    mean_length = round(mean(length, na.rm = TRUE)),
    sd_length = round(sd(length, na.rm = TRUE)),
    .groups = "drop"
  )

# Print the result, then save it as CSV.
df_summary_avg
write.csv(df_summary_avg, file = "df_summary_avg.csv", row.names = FALSE)
# "Reader comments": leftover web-page footer text, not part of the script.