# Exploratory analysis of new HGT results 11/05/2014, max (last updated
# september 2015 to incorporate more recent results and put everything
# together)
require(ape)
## Loading required package: ape

### first read in and preprocess data.  note that there are both pvals and
### overall counts (i.e. the observed data of gains of gene 1 in presence of
### gene 2), and the two have non-identical dimensions due to some filtering.

working_dir = "gainLoss_results/MOtree_GLrun"

if (!exists("allps")) {
    allps = read.table(gzfile("processed_data/reproduced_analysis/sim_null_pvals_all.txt.gz"), 
        header = T)
}

load(file.path(working_dir, "Cijmat.Rdat"))

pvals = as.matrix(allps[sort(rownames(allps)), sort(rownames(allps))])
koko = as.matrix(C_ij[rownames(pvals), rownames(pvals)])
pvals = as.matrix(pvals)
koko = as.matrix(koko)

### look at pvals and qvals a little.  for a value c_ij that represents a
### probabilistic count of <1, it's not even worth testing the upper tail for
### that due to the sparsity of the data.  there are probably elegant ways to
### decide which tests are worth doing, but our purpose is fairly heuristic
### here, so we are going to impose an arbitrary cutoff

### filter out observations with <1 count. could go further with the filter,
### but this is simple.
hist(as.matrix(pvals)[koko > 1], 100, xlab = "P-val", main = "Filter all comparisons of <=1 count")
plot of chunk unnamed-chunk-1

plot of chunk unnamed-chunk-1


### a little less than half of tests/observations are thus filtered.  the fdr
### correction gets quite a bit better.
length(koko[koko > 1])
## [1] 13483161
length(koko)
## [1] 25310961
filterps = as.matrix(pvals[koko > 1])

# now estimate qvals
qs = p.adjust(filterps, method = "fdr")
summary(qs)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.721   0.950   0.822   1.000   1.000
length(which(qs < 0.01))
## [1] 8415
length(which(qs < 0.05))
## [1] 86719

# figure out the threshold and use that to get an adjacency matrix of the
# network.
qthresh = 0.01
summary(filterps[qs < qthresh])
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.00e+00 0.00e+00 2.00e-06 2.35e-06 4.00e-06 6.00e-06

# window dressing
thresh = max(filterps[qs < qthresh])
adjmat = as.matrix(pvals)
adjmat[pvals <= thresh] = 1
adjmat[pvals > thresh] = 0
cat("total link number =", sum(adjmat), "for raw network\n")
## total link number = 8415 for raw network

rowed = which(rowSums(adjmat) > 0)
coled = which(colSums(adjmat) > 0)
both = unique(append(rowed, coled))
adjmat = adjmat[both, both]

# do a little tidying, writing, reprocessing of files
write.table(adjmat, file.path("processed_data", "raw_hgt_net_adj.txt"), quote = FALSE)
source("code/linklist_adjmat_thin.R")
links = adjmat_to_list(adjmat)
write.table(links, file.path("processed_data", "raw_hgt_net_list.cyto"), quote = FALSE, 
    row.name = FALSE, col.name = FALSE)
# remove(adjmat)
remove(links, pvals, allps, filterps, qs)

# quick and dirty power analysis to show the limits of the approach POWER -
# it takes a long time to run the power analysis (it is basically a
# stripped-down version of the hypothesis test script), but it can be run
# like this: source('code/qd_motree_power.R')
source("code/figure_scripts/powerheatmap_053014_upper.R")
plot of chunk unnamed-chunk-1

plot of chunk unnamed-chunk-1


# also check robustness of reconstruction by comparing to a max parsimony
# reconstruction
source("code/figure_scripts/mp_em_compare_072015.R")
## Loading required package: vioplot
## Loading required package: sm
## Package 'sm', version 2.2-5.4: type help(sm) for summary information
## [1] "visualizations of the correspondence between MP and EM ancestral reconstructions"
plot of chunk unnamed-chunk-1

plot of chunk unnamed-chunk-1

## [1] "now just simple heuristic to evaluate agreement"
## agreement between EM and MP is  0.9953
# now just simple heuristic to evaluate agreement
matted[, 2] = round(matted[, 2])
agreement = length(which(matted[, 1] == matted[, 2]))/nrow(matted)
cat("agreement between EM and MP is ", agreement, "\n")
## agreement between EM and MP is  0.9953

# at some point need a dag- here is one potential command to get it - hope
# for minimal FAS of 1.  if the net is too much bigger/more complicated,
# could reimplement using igraph for faster network functionality.

# system(paste('python code/arcset_remover.py',
# file.path(working_dir,'raw_hgt_net_list.cyto'), '2',sep=' ')) SERVER
# DOESN'T HAVE NETWORKX INSTALLED SO I AM RUNNING THIS LOCALLY (REQUIRES
# NETWORKX PYTHON PACKAGE)

# reconstruct adjacency matrix of net- now DAG
links = as.matrix(read.table(file.path("processed_data", "dag_hgt_net_list.cyto")))
adjmat[adjmat > 0] = 0
for (link in 1:nrow(links)) {
    adjmat[links[link, 1], links[link, 2]] = 1
}

cat("total link number =", sum(adjmat), "for DAG network\n")
## total link number = 8414 for DAG network

# next, do a transitive reduction.
print("transitive reduction starting")
## [1] "transitive reduction starting"
source("code/transitivereduction.R")

# this takes a LONG TIME. consider just supplying the command and reading
# pre-computed?
adjmat = transReduce(adjmat)

cat("total link number =", sum(adjmat), "for transitive-reduced DAG\n")
## total link number = 8228 for transitive-reduced DAG

save(adjmat, file = file.path("processed_data/hgt_net_dag_transreduced_repro.Rdat"))
links = adjmat_to_list(adjmat)
write.table(links, file.path("processed_data/treduced_hgt_net_list.cyto.repro"), 
    quote = FALSE, row.name = FALSE, col.name = FALSE)

write.table(colnames(adjmat), "processed_data/kos_in_hgtnet_082415.txt", quote = FALSE, 
    row.name = FALSE, col.name = FALSE)
# load('processed_data/hgt_net_dag_transreduced.Rdat') adjmat = tr_adj_mat
print("done with net processing")
## [1] "done with net processing"


## describing the net.  great! things coming along.  now, calculate some very
## basic network stats and look at them.
indeg = colSums(adjmat)
outdeg = rowSums(adjmat)
summary(indeg)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    0.00    0.00    3.64    1.00  307.00
summary(outdeg)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    1.00    2.00    3.64    5.00   34.00

### so it looks like in and out degree don't follow the same distribution.
### plot these values.
hist(indeg, 100, xlab = "In-degree")
plot of chunk unnamed-chunk-1

plot of chunk unnamed-chunk-1

hist(outdeg, 50, xlab = "Out-degree")
plot of chunk unnamed-chunk-1

plot of chunk unnamed-chunk-1


### now as log-log.
in_tab = table(indeg)
out_tab = table(outdeg)

plot(log2(as.numeric(names(in_tab))), log2(in_tab), xlab = "Log2(In-degree)", 
    ylab = "Log2(Frequency)", pch = 19)
plot of chunk unnamed-chunk-1

plot of chunk unnamed-chunk-1

plot(log2(as.numeric(names(out_tab))), log2(out_tab), xlab = "Log2(Out-degree)", 
    ylab = "Log2(Frequency)", pch = 19)
plot of chunk unnamed-chunk-1

plot of chunk unnamed-chunk-1

### no obvious weirdness there.  but are they super-well explained by the
### parameters of the genes in question?  that would be a bad sign.  read in
### these parameters.

# NEED TO REPLACE THIS REAL_GENES ISH.
# real_genes=read.table(file.path(paste(working_dir,'_null_simed_genes_new',sep=''),'real_gene_bins_1M.txt'),header=T)
pres = read.table("gainLoss_results/MOtree_GLrun/AncestralReconstructPosterior.txt.pres_probs", 
    header = T)
gain = read.table("gainLoss_results/MOtree_GLrun/AncestralReconstructPosterior.txt.gain_probs", 
    header = T)
prevalence = colSums(pres)[names(outdeg)]
gain_num = colSums(gain)[names(outdeg)]
rm(pres, gain)

presout = cor.test(prevalence, outdeg)
print(presout)
## 
##  Pearson's product-moment correlation
## 
## data:  prevalence and outdeg
## t = 6.525, df = 2258, p-value = 8.364e-11
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.09534 0.17628
## sample estimates:
##   cor 
## 0.136
cor.test(prevalence, outdeg, method = "spearman")
## Warning: Cannot compute exact p-value with ties
## 
##  Spearman's rank correlation rho
## 
## data:  prevalence and outdeg
## S = 1.358e+09, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##    rho 
## 0.2941

gainin = cor.test(gain_num, indeg)
print(gainin)
## 
##  Pearson's product-moment correlation
## 
## data:  gain_num and indeg
## t = 8.281, df = 2258, p-value = 2.22e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1314 0.2114
## sample estimates:
##    cor 
## 0.1717
cor.test(gain_num, indeg, method = "spearman")
## Warning: Cannot compute exact p-value with ties
## 
##  Spearman's rank correlation rho
## 
## data:  gain_num and indeg
## S = 927846535, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##    rho 
## 0.5177

### so some slight correlations, but nothing like the crazy previous
### observations.  suggests that we are right that these parameters affect
### POWER, but are not confounders for the detection of edges themselves.

### plot the values. ```{r fig.width=4, fig.height=4}
plot(gain_num, indeg, xlab = "Gain count", ylab = "In-degree", main = gainin$estimate, 
    pch = ".")
plot of chunk unnamed-chunk-1

plot of chunk unnamed-chunk-1

plot(prevalence, outdeg, xlab = "Prevalence", ylab = "Out-degree", main = presout$estimate, 
    pch = ".")
plot of chunk unnamed-chunk-1

plot of chunk unnamed-chunk-1


# and the relationship between the 2 measures of degree?
outin = cor.test(outdeg, indeg)
print(outin)
## 
##  Pearson's product-moment correlation
## 
## data:  outdeg and indeg
## t = -6.949, df = 2258, p-value = 4.789e-12
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.1848 -0.1041
## sample estimates:
##     cor 
## -0.1447
cor.test(outdeg, indeg, method = "spearman")
## Warning: Cannot compute exact p-value with ties
## 
##  Spearman's rank correlation rho
## 
## data:  outdeg and indeg
## S = 3.181e+09, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##     rho 
## -0.6537
print(outin)
## 
##  Pearson's product-moment correlation
## 
## data:  outdeg and indeg
## t = -6.949, df = 2258, p-value = 4.789e-12
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.1848 -0.1041
## sample estimates:
##     cor 
## -0.1447
plot(jitter(outdeg), jitter(indeg), ylab = "In-degree", xlab = "Out-degree", 
    main = outin$estimate, pch = ".")
plot of chunk unnamed-chunk-1

plot of chunk unnamed-chunk-1


# before we get too far ahead of ourselves, let's do some plot some of the
# source data: what is the balance of gains vs. losses? (sanity check)
losso = read.table("gainLoss_results/MOtree_GLrun/AncestralReconstructPosterior.txt.loss_probs", 
    header = TRUE)
lossed = colSums(losso)
rm(losso)

gaino = read.table("gainLoss_results/MOtree_GLrun/AncestralReconstructPosterior.txt.gain_probs", 
    header = TRUE)
gained = colSums(gaino)
rm(gaino)

plot(lossed, gained, xlab = "Number of losses", ylab = "Number of gains", pch = ".", 
    xlim = c(0, 200), ylim = c(0, 200))
# text(100,175,labels=paste('PCC =
# ',round(cor(na.omit(gained),na.omit(lossed)),2),sep=''),cex=1.5)
abline(a = 0, b = 1)
plot of chunk unnamed-chunk-1

plot of chunk unnamed-chunk-1


# next, what are the distribution of gains and prevalence, which we truly
# study?
hist(gain_num, 20, xlab = "Gains in the tree of a gene (probabilistic counts)")
plot of chunk unnamed-chunk-1

plot of chunk unnamed-chunk-1

hist(prevalence, 40, xlab = "Prevalence in the tree of a gene (probabilistic counts)")
plot of chunk unnamed-chunk-1

plot of chunk unnamed-chunk-1


# what is the expected distribution of gains, with the gainLoss computed
# rates?  (these are precomputed but can be generated using the simulation
# pipeline)
simpres = read.table(gzfile("processed_data/MOtree_apesim_presence.txt.1gain_1loss.022214_rep2.gz"))
hist(simpres[, ], 40, main = "Raw rate simulated gene prevalence", xlab = "Prevalence / gene")
plot of chunk unnamed-chunk-1

plot of chunk unnamed-chunk-1

simgain = read.table(gzfile("processed_data/MOtree_apesim_gain.txt.1gain_1loss.022214_rep2.gz"))
hist(simgain[, ], 20, main = "Raw rate simulated gene gain", xlab = "# gains / gene")
plot of chunk unnamed-chunk-1

plot of chunk unnamed-chunk-1

rm(simpres, simgain)

# show 'spread' of each null distribution
source("code/figure_scripts/calc_null_intervals_073014.R")
## [1] "gaining"
## [1] 98493
## [1] "presing"
## [1] 98493
## [1] 1249
## [1] 1249
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     2.0     3.0     6.0     7.1     8.0    48.0 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    0.00    2.00    6.19    9.00   29.00

plot of chunk unnamed-chunk-1 plot of chunk unnamed-chunk-1


# THE NETWORK REWIRING ANALYSIS IS TOO BIG TO DO HERE.  THE CODE FOR THESE
# IS code/intrapath2_shuffles_061414.R and code/netdist_shuffles_071614.R
# the metabolic network comparison can be done in principle just by
# uncommenting the following lines of code, though it would take maybe 30
# min to finish: note that these shuffling things are also stochastic, so
# the null distribution will vary a little bit from run to run. nonetheless,
# in each case, p should still be small or effectively 0.
# source('code/netdist_shuffles_071614.R')

# will just plot the results here: metabolic net overlap: this is computed
# in the script, just hard-coded here.
true = 78
intersect_size = read.table("processed_data/permdat_082415.txt")
p = length(which(intersect_size >= true))/length(intersect_size[, ])
hist(intersect_size[, ], 30, xlab = "Number of PGCEs", xlim = c(0, 85), main = paste("Pvalue = ", 
    p, sep = ""))
abline(v = true, lty = 2, lwd = 2)
plot of chunk unnamed-chunk-1

plot of chunk unnamed-chunk-1


# within-pathway edge counting
permreps = dir("processed_data/intrapath_shuffles/", pattern = "sim_counts")
perms = c()
for (rep in permreps) {
    perm = read.table(file.path("processed_data/intrapath_shuffles/", rep))
    perms = rbind(perms, perm)
}

true = read.table("processed_data/intrapath_shuffles/true_counts_pathway_082415.txt")
p = length(which(perms[, 1] >= true))/nrow(perms)

# plot it
hist(perms[, 1], 20, xlim = c(0, 250), xlab = "Number of PGCEs", main = paste("P-value = ", 
    p))
abline(v = true[1, 1])
plot of chunk unnamed-chunk-1

plot of chunk unnamed-chunk-1



# code/intrapath2_shuffles_061414.R is hard-coded to only run 10 shuffles at
# a time for parallelization. i run it in batch mode by means of the shell
# scriptshell_scripts/run_intrapath2_shuf.sh. code/netdist_shuffles_071614.R
# is hard-coded to run everything at once in a single job.  the latter runs
# a bunch of other stats that we didn't end up using as well.

# PATHWAY-PATHWAY ASSOCIATIONS - these are also hard-coded for being run in
# batch mode via shell script shell_scripts/run_findOverPathHGT.sh.

rm(adjmat, koko, C_ij)

# RUBISCO small subunit analysis - first, write out genes from net with an
# edge to rbsS (K01602)
rbsS_genes = as.vector(links[links[, 2] == "K01602", 1])

# this file as written out contains the table S1 info.
write.table(rbsS_genes, "processed_data/reproduced_analysis/rbsS_genes.txt", 
    quote = FALSE, row.name = FALSE, col.name = FALSE)

# analysis of modules to get right granularity of functions - this output
# corresponds to table s2.  this doesn't work very well on my system for
# whatever reason- so i am commenting it out so it is possible to see how it
# is done but it doesn't throw nasty errors. system('python
# code/kegg_enrich_new.py processed_data/reproduced_analysis/rbsS_genes.txt
# processed_data/kos_in_hgtnet_082415.txt
# processed_data/annots_helperfiles/module.parsed120113
# processed_data/reproduced_analysis/rbsS_enrich.txt')

# make the plots of sample rbsS associations
source("code/figure_scripts/plottree_ape_MO_080515.R")
plot of chunk unnamed-chunk-1

plot of chunk unnamed-chunk-1

## [1] 2
plot of chunk unnamed-chunk-1

plot of chunk unnamed-chunk-1

## [1] 2

# TOPOLOGICAL SORT ANALYSIS again, REQUIRES NETWORKX!! if networkx library
# were available, could uncomment the line below: system('python
# code/toposort_grouping.py processed_data/dag_hgt_net_list.cyto >
# hgt_net_genes_toposorted.txt') in lieu of being able to run that script, i
# will just use a pre-computed dataset to make the figures etc.: the
# toposort, properly processed, can be used to get the file
# processed_data/hgt_toposort_attr_082315.txt, which just maps the genes
# (KEGG KOs) to ranks.
system("python code/toposort_to_rankattr.py processed_data/toposort_090915.txt > processed_data/reproduced_analysis/hgt_toposort_attr_090915.txt")

# a lot of warnings will now print- they are just associated with drawing
# the arcs of the figure.
source("code/figure_scripts/topo_arcdiag_fig4a_072215.R")
## 1 1 -Inf
## Warning: an argument will be fractionally recycled
## Warning: an argument will be fractionally recycled
## 1 2 8.927
## Warning: an argument will be fractionally recycled
## Warning: an argument will be fractionally recycled
## 1 3 8.777
## Warning: an argument will be fractionally recycled
## Warning: an argument will be fractionally recycled
## 1 4 9.023
## Warning: an argument will be fractionally recycled
## Warning: an argument will be fractionally recycled
## 1 5 6.965
## Warning: an argument will be fractionally recycled
## Warning: an argument will be fractionally recycled
## 2 1 -Inf
## Warning: an argument will be fractionally recycled
## Warning: an argument will be fractionally recycled
## 2 2 -Inf
## Warning: an argument will be fractionally recycled
## Warning: an argument will be fractionally recycled
## 2 3 6.324
## Warning: an argument will be fractionally recycled
## Warning: an argument will be fractionally recycled
## 2 4 6.168
## Warning: an argument will be fractionally recycled
## Warning: an argument will be fractionally recycled
## 2 5 3.584
## Warning: an argument will be fractionally recycled
## Warning: an argument will be fractionally recycled
## 3 1 -Inf
## Warning: an argument will be fractionally recycled
## Warning: an argument will be fractionally recycled
## 3 2 -Inf
## Warning: an argument will be fractionally recycled
## Warning: an argument will be fractionally recycled
## 3 3 -Inf
## Warning: an argument will be fractionally recycled
## Warning: an argument will be fractionally recycled
## 3 4 5.318
## Warning: an argument will be fractionally recycled
## Warning: an argument will be fractionally recycled
## 3 5 2.708
## Warning: an argument will be fractionally recycled
## Warning: an argument will be fractionally recycled
## 4 1 -Inf
## Warning: an argument will be fractionally recycled
## Warning: an argument will be fractionally recycled
## 4 2 -Inf
## Warning: an argument will be fractionally recycled
## Warning: an argument will be fractionally recycled
## 4 3 -Inf
## Warning: an argument will be fractionally recycled
## Warning: an argument will be fractionally recycled
## 4 4 -Inf
## Warning: an argument will be fractionally recycled
## Warning: an argument will be fractionally recycled
## 4 5 2.89
## Warning: an argument will be fractionally recycled
## Warning: an argument will be fractionally recycled
## 5 1 -Inf
## Warning: an argument will be fractionally recycled
## Warning: an argument will be fractionally recycled
## 5 2 -Inf
## Warning: an argument will be fractionally recycled
## Warning: an argument will be fractionally recycled
## 5 3 -Inf
## Warning: an argument will be fractionally recycled
## Warning: an argument will be fractionally recycled
## 5 4 -Inf
## Warning: an argument will be fractionally recycled
## Warning: an argument will be fractionally recycled
## 5 5 -Inf
## Warning: an argument will be fractionally recycled
## Warning: an argument will be fractionally recycled
plot of chunk unnamed-chunk-1

plot of chunk unnamed-chunk-1

## [1] "this is table s3:"
##      Rank Number of genes Total out-degree Total in-degree
## [1,]    1            1593             7792               0
## [2,]    2             498              357            2512
## [3,]    3             118               73            2348
## [4,]    4              46                6            2992
## [5,]    5               5                0             376
source("code/figure_scripts/manybee_topofigure.R")

plot of chunk unnamed-chunk-1 plot of chunk unnamed-chunk-1

source("code/figure_scripts/plot_alltopos_motree_100814.R")
plot of chunk unnamed-chunk-1

plot of chunk unnamed-chunk-1

## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
source("code/figure_scripts/count_dists_of_gains_fromroot_080515.R")
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## 
##  Pearson's product-moment correlation
## 
## data:  depth[, 1] and (1 - depth[, 2])
## t = -11.17, df = 2208, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.2705 -0.1915
## sample estimates:
##     cor 
## -0.2314
## Warning: Cannot compute exact p-value with ties
## 
##  Spearman's rank correlation rho
## 
## data:  depth[, 1] and (1 - depth[, 2])
## S = 2.235e+09, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##     rho 
## -0.2423
## Warning: some notches went outside hinges ('box'): maybe set notch=FALSE
plot of chunk unnamed-chunk-1

plot of chunk unnamed-chunk-1

## [1] "now plot barplot of same with 95% CIs- probably less informative."
plot of chunk unnamed-chunk-1

plot of chunk unnamed-chunk-1



# 'ordering' is just a var created in the above that maps genes (KOs) to
# topological rank just hard-coding this for simplicity. a little
# duplicative.  by restricting this to only annotations related to specific
# functions, and only to enrichments (i.e. not depletions), you get Table 1.
# nonetheless, there are some other interesting things here that we don't
# talk about really.  again, the python script fails within this wrapper for
# unknown reasons but i am leaving the code commented.
write.table(rownames(ordering)[ordering[, ] == 1], "processed_data/reproduced_analysis/hgt_toporank1_082315.txt", 
    quote = FALSE, col.name = FALSE, row.name = FALSE)
# system('python code/kegg_enrich_new.py
# processed_data/reproduced_analysis/hgt_toporank1_082315.txt
# processed_data/kos_in_hgtnet_082415.txt
# processed_data/annots_helperfiles/brite.reparsed.083115
# processed_data/reproduced_analysis/hgt_toporank1_082315_enrich')
# system('cat
# processed_data/reproduced_analysis/hgt_toporank1_082315_enrich')


write.table(rownames(ordering)[ordering[, ] == 2], "processed_data/reproduced_analysis/hgt_toporank2_082315.txt", 
    quote = FALSE, col.name = FALSE, row.name = FALSE)
# system('python code/kegg_enrich_new.py
# processed_data/reproduced_analysis/hgt_toporank2_082315.txt
# processed_data/kos_in_hgtnet_082415.txt
# processed_data/annots_helperfiles/brite.reparsed.083115
# processed_data/reproduced_analysis/hgt_toporank2_082315_enrich')
# system('cat
# processed_data/reproduced_analysis/hgt_toporank2_082315_enrich')

write.table(rownames(ordering)[ordering[, ] == 3], "processed_data/reproduced_analysis/hgt_toporank3_082315.txt", 
    quote = FALSE, col.name = FALSE, row.name = FALSE)
# system('python code/kegg_enrich_new.py
# processed_data/reproduced_analysis/hgt_toporank3_082315.txt
# processed_data/kos_in_hgtnet_082415.txt
# processed_data/annots_helperfiles/brite.reparsed.083115
# processed_data/reproduced_analysis/hgt_toporank3_082315_enrich')
# system('cat
# processed_data/reproduced_analysis/hgt_toporank3_082315_enrich')

write.table(rownames(ordering)[ordering[, ] == 4], "processed_data/reproduced_analysis/hgt_toporank4_082315.txt", 
    quote = FALSE, col.name = FALSE, row.name = FALSE)
# system('python code/kegg_enrich_new.py
# processed_data/reproduced_analysis/hgt_toporank4_082315.txt
# processed_data/kos_in_hgtnet_082415.txt
# processed_data/annots_helperfiles/brite.reparsed.083115
# processed_data/reproduced_analysis/hgt_toporank4_082315_enrich')
# system('cat
# processed_data/reproduced_analysis/hgt_toporank4_082315_enrich')

write.table(rownames(ordering)[ordering[, ] == 5], "processed_data/reproduced_analysis/hgt_toporank5_082315.txt", 
    quote = FALSE, col.name = FALSE, row.name = FALSE)
# system('python code/kegg_enrich_new.py
# processed_data/reproduced_analysis/hgt_toporank5_082315.txt
# processed_data/kos_in_hgtnet_082415.txt
# processed_data/annots_helperfiles/brite.reparsed.083115
# processed_data/reproduced_analysis/hgt_toporank5_082315_enrich')
# system('cat
# processed_data/reproduced_analysis/hgt_toporank5_082315_enrich')

### PREDICTION these scripts will plot a bunch of the various figures and will
### also generate a lot of text output about the prediction quality gainLoss
### run files are provided in gainLoss_run_files/ and results in
### gainLoss_results/. All the code that I used for their post-processing is
### included but is not run in this script.  the networks and gainLoss
### reconstructions for the training sets can be found as gzipped files
### included.
source("code/figure_scripts/hgt_predict_firm.R")
## [1] "ROC + precision/recall for firmicutes partition"

plot of chunk unnamed-chunk-1 plot of chunk unnamed-chunk-1

## number of edges: 3703 
## number of predictable genes: 395 
## number of gained genes: 3281
source("code/figure_scripts/hgt_predict_alpha.R")
## [1] "ROC for alpha/betaproteobacteria partition"

plot of chunk unnamed-chunk-1 plot of chunk unnamed-chunk-1

## number of edges: 1726 
## number of predictable genes: 206 
## number of gained genes: 3505
source("code/figure_scripts/hgt_predict_all.R")
## [1] "ROC for using whole dataset as predictor (overfit, probably)"

plot of chunk unnamed-chunk-1 plot of chunk unnamed-chunk-1

## number of edges: 8228 
## number of predictable genes: 667 
## number of gained genes: 3281
source("code/figure_scripts/prediction_vioplot.R")
## [1] "U-tests can be used to assess the significance of a predictive classifier (such as we used) in properly classifying data.  Can be thought of as testing for the significance of the AUC, if it were a test statistic."
## [1] "test for significance of prediction for alpha/betaproteobacteria partition"
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  as.numeric(alpha_for_roc[alpha_for_roc[, 2] == 0, 1]) and as.numeric(alpha_for_roc[alpha_for_roc[, 2] == 1, 1])
## W = 4848668, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 
## [1] "test for significance of prediction for firmicutes partition"
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  as.numeric(firm_for_roc[firm_for_roc[, 2] == 0, 1]) and as.numeric(firm_for_roc[firm_for_roc[, 2] == 1, 1])
## W = 16943431, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 
## [1] "test for significance of prediction for unpartitioned dataset (overfit), predicting gains in firmicutes"
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  as.numeric(all_for_roc[all_for_roc[, 2] == 0, 1]) and as.numeric(all_for_roc[all_for_roc[, 2] == 1, 1])
## W = 37225120, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 
## [1] "generates figure from supp (fig s8)"
## [1] "y axis is prediction score. first panel is firmicutes, second is alpha/beta"

plot of chunk unnamed-chunk-1 plot of chunk unnamed-chunk-1