长度分布折线图

作者: 花生学生信 | 来源:发表于2024-08-11 19:45 被阅读0次

统计fastq长度分布
如何统计测序文件的reads长度分布
如何在Power BI中制作正态分布图
小 RNA 长度分布统计
玉米基因组的一些信息记录
《用图表说话》-频率分布柱形图20160615
python可视化
3. 数据的概括性度量
[26] 《R数据科学》分类变量和连续变量
fasta/fastq序列长度分布统计

library(dplyr)
library(readr)

# Read the tab-separated length table; the first line of the file supplies
# the column names.
df <- read_delim(file = "all_length.tsv", delim = "\t", col_names = TRUE)

# Keep only the rows whose `type` is not exactly "INV" (an exact comparison,
# not a substring match). Rows with a missing `type` are dropped as well,
# because dplyr's filter() keeps only rows where the condition is TRUE.
df_filtered <- filter(df, type != "INV")

# From here on `df` refers to the filtered table.
df <- df_filtered

# Preview the first rows of the filtered table.
head(df)
library(ggplot2)

# Count records per type, group and 50-wide length bin over (0, 500].
# Bins are right-closed, e.g. (0,50]. Lengths that are <= 0 or > 500 fall
# outside every bin, so cut() returns NA for them and they are counted
# together under an NA bin.
# count() is the one-step equivalent of group_by() + summarise(n()) +
# ungroup(): it returns an ungrouped table and emits no grouping message.
df_summary <- df %>%
  mutate(binned_length = cut(length, breaks = seq(0, 500, by = 50))) %>%
  count(type, group, binned_length, name = "count")

# Preview the first rows of the summary.
head(df_summary)

# Export the short-range counts; row names are omitted from the file.
write.csv(df_summary, file = "df_summary.csv", row.names = FALSE)


# Count records per type, group and length bin for the plot. This overwrites
# the `df_summary` computed earlier with a coarser, wider-ranging binning.
# Bins are right-closed: 100-wide up to 1000, then (1000,5000],
# (5000,10000] and (10000,100000]. Lengths that are <= 0 or > 100000 fall
# outside every bin and are counted under an NA bin.
# dig.lab = 6 keeps the bin labels in plain digits, e.g. "(1000,5000]";
# with the default of 3 significant digits cut() would label the same bin
# "(1e+03,5e+03]", which is hard to read on the x axis.
df_summary <- df %>%
  mutate(
    binned_length = cut(
      length,
      breaks = c(seq(0, 1000, by = 100), 5000, 10000, 100000),
      dig.lab = 6
    )
  ) %>%
  count(type, group, binned_length, name = "count")

# One line per type/group combination: colour encodes the type and line type
# encodes the group. The explicit `group` aesthetic is needed because the
# x axis is a factor; it tells geom_line() which points belong to one line.
p <- ggplot(
  df_summary,
  aes(
    x = binned_length,
    y = count,
    color = type,
    linetype = group,
    group = interaction(type, group)
  )
) +
  # `linewidth` is the line-thickness argument since ggplot2 3.4.0; using
  # `size` for lines is deprecated and triggers a warning.
  geom_line(linewidth = 1) +
  labs(
    title = "Length Distribution by Type and Group",
    x = "Length Interval",
    y = "Count",
    color = "Type",
    linetype = "Group"
  ) +
  # Only these type and group values have an entry in the manual scales.
  scale_color_manual(values = c("INS" = "blue", "DEL" = "red", "DUP" = "green")) +
  scale_linetype_manual(values = c("Xian" = "solid", "Geng" = "dashed")) +
  theme_minimal()

# Display the plot.
p

# Save the plot as a 15 x 8 inch PNG at 300 dpi.
ggsave("length_distribution.png", plot = p, width = 15, height = 8, dpi = 300)



# Mean and standard deviation of `length` within every type / group /
# length-bin combination, each rounded to the nearest whole number.
# The bins match the ones used for the plot (right-closed; lengths <= 0 or
# > 100000 end up in an NA bin). dig.lab = 6 keeps the bin labels written to
# the CSV in plain digits, e.g. "(1000,5000]" instead of "(1e+03,5e+03]".
# sd() is NA for a combination that contains a single record.
df_summary_avg <- df %>%
  mutate(
    binned_length = cut(
      length,
      breaks = c(seq(0, 1000, by = 100), 5000, 10000, 100000),
      dig.lab = 6
    )
  ) %>%
  group_by(type, group, binned_length) %>%
  summarise(
    mean_length = round(mean(length, na.rm = TRUE)),
    sd_length = round(sd(length, na.rm = TRUE)),
    # Return an ungrouped table and suppress the grouping message.
    .groups = "drop"
  )

# Show the result.
df_summary_avg

# Export the per-bin statistics; row names are omitted from the file.
write.csv(df_summary_avg, file = "df_summary_avg.csv", row.names = FALSE)