

library(tidyverse)
library(Biostrings)


#' Parse FASTA header lines
#'
#' Each header line is split into a sequence identifier (the first
#' whitespace-delimited token), an optional run of modifiers written either
#' as `[name=value]` or as bare `name=value`, and a free-text title made of
#' whatever follows the modifiers. A leading `>` is not stripped and would
#' end up in `seq_id`.
#'
#' A warning is raised when some header lines do not match the expected
#' layout. The function aborts when two header lines share the same `seq_id`
#' or when a modifier is set several times in the same header.
#'
#' @param hdr_lines a character vector containing FASTA header lines
#'
#' @return a tibble of parsed elements (one row per header line) with columns
#'   `seq_id`, `full_header`, `title` and `tags`. `tags` is a data.frame
#'   column holding `seq_id` plus one character column per modifier name
#'   (`NA` where a header does not set that modifier).
#' @export
#'
#' @examples
#' fa_parse_header_line(c("contig_1 [topology=linear] [b=5] this is contig1","contig_2 [topology=circular] [coverage=50,10,10] this is contig2"))
fa_parse_header_line <- function(hdr_lines) {
	# Regex for a single modifier. Capture groups when used on its own:
	#   3/4 = name/value of the bracketed form  [name=value]
	#   6/7 = name/value of the bare form       name=value
	# The bare name must not contain whitespace or brackets: otherwise a
	# bracketed modifier preceded by a blank is captured by the bare
	# alternative and gets cut at the first blank inside its value.
	tag_pat <- "((\\[([^=]+)=([^\\]]+)\\])|(([^=\\s\\[\\]]+)=([^\\s]+)))"
	# Whole header: group 1 = seq_id, group 2 = all modifiers, group 11 = title
	hdr_pat <- str_glue("^([^\\s]+)((\\s*{tag_pat})*)\\s*(.*)$")
	if (!all(str_detect(hdr_lines, hdr_pat))) rlang::warn("Some header lines are malformed")

	headers <- tibble(full_header = hdr_lines) |>
		mutate(
			seq_id = str_extract(full_header, hdr_pat, group = 1),
			tags_str = str_trim(str_extract(full_header, hdr_pat, group = 2)),
			title = str_extract(full_header, hdr_pat, group = 11)
		) |>
		relocate(seq_id)
	if (any(duplicated(headers$seq_id))) rlang::abort("FASTA contains duplicated seq_id")

	# Extract key-value pairs from modifiers: one row per (seq_id, modifier)
	tags <- select(headers, seq_id, tags_str) |>
		mutate(tags = str_extract_all(tags_str, tag_pat), tags_str = NULL) |>
		unnest(tags) |>
		mutate(
			# Only one of the two forms matches; take whichever is not NA
			name = coalesce(str_extract(tags, tag_pat, group = 3), str_extract(tags, tag_pat, group = 6)),
			value = coalesce(str_extract(tags, tag_pat, group = 4), str_extract(tags, tag_pat, group = 7)),
			tags = NULL
		)

	if (any(duplicated(tags[c("seq_id", "name")]))) rlang::abort("Some modifiers are set several times in the same FASTA header")

	# One column per modifier name, then realign on the headers so that
	# sequences without any modifier still get a (all-NA) row
	tags <- pivot_wider(tags, id_cols = "seq_id", names_from = name, values_from = value)
	tags <- left_join(select(headers, seq_id), tags, by = "seq_id", relationship = "one-to-one")

	add_column(headers, tags = tags) |>
		select(!tags_str)
}


#' Standardize a table of tags
#'
#' Derives a single `topology` column from the columns that may describe it.
#' When present, `circular` ("true"/"false") and `flye.is_circular` ("Y"/"N")
#' are recoded to "circular"/"linear" (any other value becomes `NA`). The
#' topology is then the first non-missing value among `topology`, `circular`
#' and `flye.is_circular`, in that order, defaulting to "linear".
#'
#' @param tags a data.frame of tags (one row per sequence)
#'
#' @return `tags` with `topology` as first column and without the `circular`
#'   and `flye.is_circular` columns
fa_tags_standardize <- function(tags) {
	topo_labels <- c("circular", "linear")
	# Columns consulted for the topology, by decreasing priority; `.missing`
	# is a temporary column that carries the default value
	topo_sources <- c("topology", "circular", "flye.is_circular", ".missing")
	helper_cols <- c("circular", "flye.is_circular", ".missing")

	recoded <- tags |>
		mutate(across(any_of("circular"), \(x) factor(x, c("true", "false"), topo_labels))) |>
		mutate(across(any_of("flye.is_circular"), \(x) factor(x, c("Y", "N"), topo_labels))) |>
		mutate(.missing = "linear")

	# Whichever source columns actually exist, in priority order
	candidates <- select(recoded, any_of(topo_sources))

	recoded |>
		mutate(topology = coalesce(!!!candidates)) |>
		select(-any_of(helper_cols)) |>
		relocate(topology)
}



#' Take a tibble and generate the tags strings that can be used in fasta headers
#'
#' @param tags a data.frame of tags (one row per sequence, one column per
#'   tag); a `seq_id` column, if present, is ignored
#'
#' @return a character vector with one element per row of `tags`, made of
#'   `[name=value]` items separated by single spaces (`""` when there is no
#'   tag column). Missing values are written with an empty value (`[name=]`).
fa_tags_to_str <- function(tags) {
	tags <- select(tags, !any_of("seq_id"))
	if (ncol(tags) == 0) {
		# No tag at all: one empty string per sequence
		return(rep("", nrow(tags)))
	}
	tags |>
		mutate(across(
			everything(),
			# Convert to character first: replace_na() needs the replacement to
			# be compatible with the column type, so "" is rejected for
			# numeric or factor columns
			\(x) sprintf("[%s=%s]", cur_column(), replace_na(as.character(x), ""))
		)) |>
		unite("tag_str", sep = " ") |>
		pull("tag_str")
}




#' Read a FLYEINFO table
#'
#' Reads the tab-separated assembly information table and names its eight
#' columns. Lines starting with `#` (such as the `#seq_name ...` header line
#' that Flye writes at the top of `assembly_info.txt`) are skipped.
#'
#' @param f a character vector containing path to flyeinfo file
#'
#' @return a data.frame of parsed elements with columns `seq_id`, `length`,
#'   `coverage`, `is_circular`, `is_repetitive`, `multiplicity`,
#'   `alternative_group` and `graph_path`; `is_circular` is a factor with
#'   levels `Y` and `N`
#' @importFrom readr read_tsv
#' @importFrom forcats fct
#' @importFrom dplyr mutate
#' @export
read_flye_info <- function(f) {
	flye_cols <- c(
		"seq_id", "length", "coverage", "is_circular",
		"is_repetitive", "multiplicity", "alternative_group", "graph_path"
	)
	# Without `comment`, a "#seq_name ..." header line would be read as a data
	# row and its "circ." value rejected by fct() below
	read_tsv(f, col_names = flye_cols, comment = "#") |>
		mutate(is_circular = fct(is_circular, c("Y", "N")))
}


#' Read a tabular BLAST output (e.g. from TBLASTN)
#'
#' @param f a character vector containing path to blast output file generated
#'   with parameter "-outfmt '6 std qlen slen'"
#'
#' @return the table read by `read_tsv()`, with columns `query_id`,
#'   `subject_id`, `pct_ident`, `align_len`, `num_mismatch`, `num_gapopen`,
#'   `query_start`, `query_end`, `subject_start`, `subject_end`, `evalue`,
#'   `bit_score`, `query_len` and `subject_len`
#' @importFrom readr read_tsv
#' @export
read_blast_fmt6 <- function(f) {
	# Column names, in file order; the last two match the extra
	# "qlen slen" fields requested in the output format
	blast_cols <- c(
		"query_id", "subject_id", "pct_ident", "align_len",
		"num_mismatch", "num_gapopen",
		"query_start", "query_end", "subject_start", "subject_end",
		"evalue", "bit_score",
		"query_len", "subject_len"
	)
	read_tsv(f, col_names = blast_cols)
}







