#!/usr/bin/env Rscript

#./bin/fa_rotate --out out.fasta --dict db/assembly_starts.faa tests/r19b16.hdr.fasta

#-#-#-#-#-#-#-#-#-#-#-#-#
# Argument parsing
#-#-#-#-#-#-#-#-#-#-#-#-#
library(optparse)
# Command-line interface: two named options plus exactly one positional
# argument (the FASTA file to rotate).
option_list <- list( 
    make_option("--dict",help="A multi-FASTA with starting protein sequences [%default]",default=fs::path(Sys.getenv("FATOOLS_DIR","."),"db/DnaA_RepA_db_for_assembly_rotation.faa")),
    #make_option("--no-topology", help="If set, try to rotate all sequence. Otherwise rotate only sequences with [topology=circular] in their header line",action = "store_true",default = FALSE),
    make_option("--out", help="Name of the output FASTA file [required]")
)
opt <- parse_args(
	OptionParser(
		description="BLAST a set of start-sequences to a FASTA file (e.g. dnaA, repA). 
		Adapt orientation of the sequences according to the best hit, and rotate 
		circular sequences (with header tag [topology=circular]).",
		usage = "%prog [options] input.fasta",
		# The example must include --out, which is mandatory (see check below)
		epilogue = "Example usage:
      %prog --out rotated.fasta assembly.fasta
		",
		option_list = option_list
	),
	positional_arguments = 1
)
if (is.null(opt$options$out)) stop("missing --out argument")
if (is.null(opt$options$dict)) stop("missing --dict argument")

# Fail early with a clear message instead of letting the external TBLASTN
# call fail on an unreadable path.
if (!file.exists(opt$options$dict)) {
	stop("--dict file not found: ", opt$options$dict, call. = FALSE)
}
if (!file.exists(opt$args[1])) {
	stop("input FASTA file not found: ", opt$args[1], call. = FALSE)
}



#-#-#-#-#-#-#-#-#-#-#-#-#
# Script
#-#-#-#-#-#-#-#-#-#-#-#-#
suppressPackageStartupMessages({
	source(fs::path(Sys.getenv("FATOOLS_DIR","."),"bin/lib_fa.R"))
  library(IRanges)
  library(GenomicRanges)
})



# Align amino-acid database of starting points on the assembly FASTA
#
# Runs the external `tblastn` program with the protein FASTA as query and the
# nucleotide FASTA as subject, writing tabular output ('6 std qlen slen') to a
# temporary file that is then parsed with read_blast_fmt6().
#
# @param db_faa Path to the protein multi-FASTA used as query.
# @param seq_fna Path to the nucleotide FASTA used as subject.
# @return Whatever read_blast_fmt6() returns for the TBLASTN output file.
# Stops with an error if the tblastn command exits with a non-zero status.
run_tblastn <- function(db_faa,seq_fna) {
	tblastn_file <- fs::file_temp(ext = ".tblastn")

	# Quote every path so that spaces or shell metacharacters in file names
	# cannot break (or alter) the command line.
	query_arg <- shQuote(as.character(db_faa))
	subject_arg <- shQuote(as.character(seq_fna))
	out_arg <- shQuote(as.character(tblastn_file))

	cmd <- str_glue("tblastn -subject_besthit -query {query_arg} -qcov_hsp_perc 60 -subject {subject_arg} -outfmt '6 std qlen slen' -out {out_arg}")
	rlang::inform(c("Run TBLASTN command","i"=cmd))

	# system() returns the exit status; do not go on to parse a missing or
	# truncated output file when the command failed.
	status <- system(cmd)
	if (status != 0) {
		stop("tblastn failed with exit status ", status, call. = FALSE)
	}

	read_blast_fmt6(tblastn_file)
}





#-#-#-#-#-#-#-#-#-#-#-#-#
# Main
#-#-#-#-#-#-#-#-#-#-#-#-#

# Run TBLASTN against the reference database and determine best TBLASTN hit
best_hits <- run_tblastn(opt$options$dict,opt$args[1]) 

# For debugging
#save.image(file="checkpoint.RData");quit(save = "no")
#load(file="checkpoint.RData")
# ggplot(best_hits) + 
# 	geom_segment(aes(y=subject_id,yend=subject_id,x=0,xend = subject_len),data=~distinct(.,subject_id,subject_len)) +
# 	geom_segment(aes(y=subject_id,yend=subject_id,x=subject_start,xend = subject_end),linewidth=8)
	


# Keep a single hit per subject sequence: the one with the lowest e-value
# (first row wins on ties). The grouping is dropped right after the selection
# so that downstream code works on a plain, ungrouped table.
# A hit whose subject_end is smaller than its subject_start is flagged as
# lying on the minus strand.
best_hits <- best_hits |>
	group_by(subject_id) |>
	slice_min(evalue,n=1,with_ties = FALSE) |>
	ungroup() |>
	mutate(strand = case_when(
		subject_end < subject_start ~ "-",
		TRUE ~ "+"
	))

# Load fasta
fa <- readDNAStringSet(opt$args[1])

# parse FASTA header line
hdr <- fa_parse_header_line(names(fa))


# Compute rotation point and strand reversal orientation so that we
# reverse-complement FASTA sequences no matter their topology as long as they have a minus-strand hit
# and rotate circular sequences if they have a hit.
#
# Tags added to hdr$tags:
#   rotate.hit      - "<query_id>:<query_start>-<query_end>" of the best hit (NA without hit)
#   rotate.reversed - "Y" when the best hit is on the minus strand, "N" otherwise
#   rotate.pos      - 1-based position of the original sequence at which it is cut;
#                     the sequence is re-joined as [rotate.pos..end] + [1..rotate.pos-1]
#                     (1 means "no rotation")
i <- match(hdr$seq_id,best_hits$subject_id)
hdr$tags <- hdr$tags |>
	mutate(rotate.hit = str_c(best_hits$query_id[i],":",best_hits$query_start[i],"-",best_hits$query_end[i])) |>
	mutate(rotate.reversed = if_else(best_hits$strand[i] %in% "-","Y","N")) |>
	mutate(rotate.pos = case_when(
		is.na(i) ~ 1, # no hit => no rotation
		!(topology %in% "circular") ~ 1, # not a circular topology => no rotation
		# Minus-strand hit: the hit begins at subject_start (its highest
		# coordinate) and runs towards subject_end. Because the sequence is
		# reverse-complemented AFTER the rotation, the base just before the cut
		# becomes the first base of the output. Cutting right after subject_start
		# therefore puts the hit at the beginning of the final sequence
		# (cutting at subject_end would leave it at the very end).
		rotate.reversed %in% "Y" ~ best_hits$subject_start[i] + 1,
		rotate.reversed %in% "N" ~ best_hits$subject_start[i],
		TRUE ~ 1
	))


# Rotate and reverse-complement sequences
# Each sequence is cut at its rotate.pos and re-joined as
# [rotate.pos..end] followed by [1..rotate.pos-1]; with rotate.pos == 1 the
# second part is empty and the sequence is left unchanged.
cut_pos <- hdr$tags$rotate.pos
is_reversed <- hdr$tags$rotate.reversed %in% "Y"

from_cut_to_end <- subseq(fa,start=cut_pos)
from_start_to_cut <- subseq(fa,end=pmax(cut_pos-1L,0L))
fa <- xscat(from_cut_to_end,from_start_to_cut)

# Flip only the sequences flagged as reversed
fa[is_reversed] <- reverseComplement(fa[is_reversed])


# Update header line: sequence id, serialized tags, then the title
names(fa) <- str_glue("{hdr$seq_id} {fa_tags_to_str(hdr$tags)} {hdr$title}")

# Write output
writeXStringSet(fa,opt$options$out)






