Skip to content
Snippets Groups Projects
Commit 93b6c397 authored by Julien.Prados's avatar Julien.Prados
Browse files

make a generic lib_fa.R library

parent 5a1d83c7
No related branches found
No related tags found
No related merge requests found
Pipeline #98454 failed
......@@ -12,8 +12,6 @@ build:
script:
- buildah images
- echo $IMAGE_TAG
- ls
- buildah build -t $IMAGE_TAG
- buildah images
- buildah push $IMAGE_TAG
......
......@@ -9,7 +9,11 @@ RUN apt-get update && apt-get install -y \
&& rm -rf /var/cache/apt/* /var/lib/apt/lists/*;
RUN install2.r --error --skipinstalled --ncpus -1 \
optparse \
BiocManager \
&& Rscript -e 'BiocManager::install(c("Biostrings","GenomicRanges"))' &&
&& rm -rf /tmp/downloaded_packages
ENV FATOOLS_DIR="/app"
COPY db/ /app/db
COPY bin/ /app/bin
......
......@@ -15,11 +15,8 @@ docker run --rm -v .:/cwd registry.gitlab.unige.ch/amr-genomics/fatools fa_rotat
```
# Building
To build the container, run:
```bash
docker build --platform linux/amd64 -t registry.gitlab.unige.ch/amr-genomics/fatools ./
```
# FASTA format
For FASTA header format consult: https://www.ncbi.nlm.nih.gov/genbank/fastaformat/
#!/usr/bin/env Rscript
# For FASTA header format consult
#https://www.ncbi.nlm.nih.gov/genbank/fastaformat/
# Example usage:
# ./bin/fa_reheader --prefix r34b14. --flyeinfo out/r34b14.fasta.flyeinfo out/r34b14.fasta
# ./bin/fa_reheader --prefix r10b14. out/r10b14.fasta
install_dir <- Sys.getenv("FATOOLS_DIR")
#-#-#-#-#-#-#-#-#-#-#-#-#
# Argument parsing
......@@ -20,73 +13,28 @@ option_list <- list(
make_option("--tsv", help="Path to a tab-separated table with additional sequence metadata. The file must contain a header line with `seq_id` column."),
make_option("--prefix", help="A prefix to add to every seq_id",default = "")
)
opt <- parse_args(OptionParser(description="Change headers lines in a FASTA file",option_list=option_list),positional_arguments = 1)
opt <- parse_args(
OptionParser(
description = "Change headers lines in a FASTA file",
usage = "%prog [options] input.fasta",
epilogue = "Example usage:
%prog --prefix r34b14. --flyeinfo out/r34b14.fasta.flyeinfo out/r34b14.fasta
%prog --prefix r10b14. out/r10b14.fasta
",
option_list = option_list
),
positional_arguments = 1
)
#-#-#-#-#-#-#-#-#-#-#-#-#
# Script
#-#-#-#-#-#-#-#-#-#-#-#-#
suppressPackageStartupMessages({
library(Biostrings)
library(tidyverse)
source(fs::path(install_dir,"bin/lib_fa.R"))
})
# Read FLYE info file
read_flye_info <- function(f) {
read.table(
f,sep="\t",header=FALSE,
col.names=c("seq_id","length","coverage","is_circular","is_repetitive","multiplicity","alternative_group","graph_path")
) |>
mutate(is_circular = fct(is_circular,c("Y","N")))
}
#' Parse FASTA header line
#'
#' @param hdr_lines a character vector containing FASTA header lines
#'
#' @return a data.frame of parsed elements
#' @export
#'
#' @examples
#' fa_parse_header_line(c("contig_1 [topology=linear] [b=5] this is contig1","contig_2 [topology=circular] [coverage=50,10,10] this is contig2"))
fa_parse_header_line <- function(hdr_lines) {
# regex-pattern for modifier and header line
tag_pat <- "((\\[([^=]+)=([^\\]]+)\\])|(([^=]+)=([^\\s]+)))"
hdr_pat <- str_glue("^([^\\s]+)((\\s*{tag_pat})*)\\s*(.*)$")
if (!all(str_detect(hdr_lines,hdr_pat))) rlang::warn("Some header lines are malformed")
headers <- tibble(full_header = hdr_lines) |>
mutate(
seq_id = str_extract(full_header,hdr_pat,group=1),
tags_str = str_trim(str_extract(full_header,hdr_pat,group=2)),
title = str_extract(full_header,hdr_pat,group=11)
) |>
relocate(seq_id)
stopifnot("FASTA contains duplicated seq_id" = all(!duplicated(headers$seq_id)))
# Extract key-value pairs from modifiers
tags <- select(headers,seq_id,tags_str) |>
mutate(tags = str_extract_all(tags_str,tag_pat),tags_str = NULL) |>
unnest(tags) |>
mutate(tags = str_trim(tags)) |>
# TODO: use coalesce() here ?
mutate(
name = if_else(is.na(str_extract(tags,tag_pat,3)),str_extract(tags,tag_pat,6),str_extract(tags,tag_pat,3)),
value = if_else(is.na(str_extract(tags,tag_pat,4)),str_extract(tags,tag_pat,7),str_extract(tags,tag_pat,4)),
tags = NULL
)
stopifnot("Some modifiers are set several times in the same header" = !(summarize(tags,.by=c(seq_id,name),any(n()>1)) |> pull() |> any()))
tags <- pivot_wider(tags,id_cols="seq_id")
tags <- left_join(select(headers,seq_id),tags,by="seq_id",relationship = "one-to-one")
add_column(headers,tags=tags) |>
select(!tags_str)
}
simplify_seq_id <- function(seq_id) {
str_replace(seq_id,"^contig_","")
}
......@@ -122,8 +70,6 @@ tags_std <- function(tags) {
# Read FASTA
dna <- readDNAStringSet(opt$args)
#dna <- readDNAStringSet("out/r10b14.fasta")
#dna <- readDNAStringSet("out/r34b14.fasta")
# Parse header
headers <- fa_parse_header_line(names(dna))
......@@ -138,7 +84,7 @@ if (!is.null(opt$options$flyeinfo)) {
# Add TSV info if available
if (!is.null(opt$options$tsv)) {
tsv <- read.table(opt$options$tsv,sep="\t",comment="",quote="",header=TRUE)
tsv <- read_tsv(opt$options$tsv,comment="",quote="")
stopifnot("no `seq_id` column found in given TSV" = "seq_id" %in% names(tsv))
headers$tags <- left_join(headers$tags,tsv,by = "seq_id", relationship = "one-to-one", suffix = c("",".tsv"))
}
......
library(tidyverse)
library(Biostrings)
#' Parse FASTA header line
#'
#' @param hdr_lines a character vector containing FASTA header lines
#'
#' @return a data.frame of parsed elements
#' @export
#'
#' @examples
#' fa_parse_header_line(c("contig_1 [topology=linear] [b=5] this is contig1","contig_2 [topology=circular] [coverage=50,10,10] this is contig2"))
fa_parse_header_line <- function(hdr_lines) {
# regex-pattern for modifier and header line
tag_pat <- "((\\[([^=]+)=([^\\]]+)\\])|(([^=]+)=([^\\s]+)))"
hdr_pat <- str_glue("^([^\\s]+)((\\s*{tag_pat})*)\\s*(.*)$")
if (!all(str_detect(hdr_lines,hdr_pat))) rlang::warn("Some header lines are malformed")
headers <- tibble(full_header = hdr_lines) |>
mutate(
seq_id = str_extract(full_header,hdr_pat,group=1),
tags_str = str_trim(str_extract(full_header,hdr_pat,group=2)),
title = str_extract(full_header,hdr_pat,group=11)
) |>
relocate(seq_id)
stopifnot("FASTA contains duplicated seq_id" = all(!duplicated(headers$seq_id)))
# Extract key-value pairs from modifiers
tags <- select(headers,seq_id,tags_str) |>
mutate(tags = str_extract_all(tags_str,tag_pat),tags_str = NULL) |>
unnest(tags) |>
mutate(tags = str_trim(tags)) |>
# TODO: use coalesce() here ?
mutate(
name = if_else(is.na(str_extract(tags,tag_pat,3)),str_extract(tags,tag_pat,6),str_extract(tags,tag_pat,3)),
value = if_else(is.na(str_extract(tags,tag_pat,4)),str_extract(tags,tag_pat,7),str_extract(tags,tag_pat,4)),
tags = NULL
)
stopifnot("Some modifiers are set several times in the same header" = !(summarize(tags,.by=c(seq_id,name),any(n()>1)) |> pull() |> any()))
tags <- pivot_wider(tags,id_cols="seq_id")
tags <- left_join(select(headers,seq_id),tags,by="seq_id",relationship = "one-to-one")
add_column(headers,tags=tags) |>
select(!tags_str)
}
#' Read a FLYEINFO table
#'
#' @param f a character vector containing path to flyeinfo file
#'
#' @return a data.frame of parsed elements
#' @importFrom readr read_tsv
#' @importFrom fct forcats
#' @importFrom mutate dplyr
#' @export
read_flye_info <- function(f) {
read_tsv(f,col_names = c("seq_id","length","coverage","is_circular","is_repetitive","multiplicity","alternative_group","graph_path")) |>
mutate(is_circular = fct(is_circular,c("Y","N")))
}
#' Read a TBLASTN output
#'
#' @param f a character vector containing path to tblastn file
#'
#' @return a DataFrame
#' @importFrom readr read_tsv
#' @export
read_tblastn <- function(f) {
x <- read_tsv(f,col_names = c("query_id","subject_id","pct_ident","align_len","num_mismatch","num_gapopen","query_start","query_end","subject_start","subject_end","evalue","bit_score","query_len","subject_len"))
x
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment