# Exploratory analysis of new HGT results 11/05/2014, max (last updated
# september 2015 to incorporate more recent results and put everything
# together)
require(ape)
## Loading required package: ape
### first read in and preprocess data. note that there are both pvals and
### overall counts (i.e. the observed data of gains of gene 1 in presence of
### gene 2), and the two have non-identical dimensions due to some filtering.
working_dir = "gainLoss_results/MOtree_GLrun"
if (!exists("allps")) {
allps = read.table(gzfile("processed_data/reproduced_analysis/sim_null_pvals_all.txt.gz"),
header = T)
}
load(file.path(working_dir, "Cijmat.Rdat"))
pvals = as.matrix(allps[sort(rownames(allps)), sort(rownames(allps))])
koko = as.matrix(C_ij[rownames(pvals), rownames(pvals)])
pvals = as.matrix(pvals)
koko = as.matrix(koko)
### look at pvals and qvals a little. for a value c_ij that represents a
### probabilistic count of <1, it's not even worth testing the upper tail for
### that due to the sparsity of the data. there are probably elegant ways to
### decide which tests are worth doing, but our purpose is fairly heuristic
### here, so we are going to impose an arbitrary cutoff
### filter out observations with <1 count. could go further with the filter,
### but this is simple.
hist(as.matrix(pvals)[koko > 1], 100, xlab = "P-val", main = "Filter all comparisons of <=1 count")
plot of chunk unnamed-chunk-1
### a little less than half of tests/observations are thus filtered. the fdr
### correction gets quite a bit better.
length(koko[koko > 1])
## [1] 13483161
length(koko)
## [1] 25310961
filterps = as.matrix(pvals[koko > 1])
# now estimate qvals
qs = p.adjust(filterps, method = "fdr")
summary(qs)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.721 0.950 0.822 1.000 1.000
length(which(qs < 0.01))
## [1] 8415
length(which(qs < 0.05))
## [1] 86719
# figure out the threshold and use that to get an adjacency matrix of the
# network.
qthresh = 0.01
summary(filterps[qs < qthresh])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00e+00 0.00e+00 2.00e-06 2.35e-06 4.00e-06 6.00e-06
# window dressing
thresh = max(filterps[qs < qthresh])
adjmat = as.matrix(pvals)
adjmat[pvals <= thresh] = 1
adjmat[pvals > thresh] = 0
cat("total link number =", sum(adjmat), "for raw network\n")
## total link number = 8415 for raw network
rowed = which(rowSums(adjmat) > 0)
coled = which(colSums(adjmat) > 0)
both = unique(append(rowed, coled))
adjmat = adjmat[both, both]
# do a little tidying, writing, reprocessing of files
write.table(adjmat, file.path("processed_data", "raw_hgt_net_adj.txt"), quote = FALSE)
source("code/linklist_adjmat_thin.R")
links = adjmat_to_list(adjmat)
write.table(links, file.path("processed_data", "raw_hgt_net_list.cyto"), quote = FALSE,
row.name = FALSE, col.name = FALSE)
# remove(adjmat)
remove(links, pvals, allps, filterps, qs)
# quick and dirty power analysis to show the limits of the approach POWER -
# it takes a long time to run the power analysis (it is basically a
# stripped-down version of the hypothesis test script), but it can be run
# like this: source('code/qd_motree_power.R')
source("code/figure_scripts/powerheatmap_053014_upper.R")
plot of chunk unnamed-chunk-1
# also check robustness of reconstruction by comparing to a max parsimony
# reconstruction
source("code/figure_scripts/mp_em_compare_072015.R")
## Loading required package: vioplot
## Loading required package: sm
## Package 'sm', version 2.2-5.4: type help(sm) for summary information
## [1] "visualizations of the correspondence between MP and EM ancestral reconstructions"
plot of chunk unnamed-chunk-1
## [1] "now just simple heuristic to evaluate agreement"
## agreement between EM and MP is 0.9953
# now just simple heuristic to evaluate agreement
matted[, 2] = round(matted[, 2])
agreement = length(which(matted[, 1] == matted[, 2]))/nrow(matted)
cat("agreement between EM and MP is ", agreement, "\n")
## agreement between EM and MP is 0.9953
# at some point need a dag- here is one potential command to get it - hope
# for minimal FAS of 1. if the net is too much bigger/more complicated,
# could reimplement using igraph for faster network functionality.
# system(paste('python code/arcset_remover.py',
# file.path(working_dir,'raw_hgt_net_list.cyto'), '2',sep=' ')) SERVER
# DOESN'T HAVE NETWORKX INSTALLED SO I AM RUNNING THIS LOCALLY (REQUIRES
# NETWORKX PYTHON PACKAGE)
# reconstruct adjacency matrix of net- now DAG
links = as.matrix(read.table(file.path("processed_data", "dag_hgt_net_list.cyto")))
adjmat[adjmat > 0] = 0
for (link in 1:nrow(links)) {
adjmat[links[link, 1], links[link, 2]] = 1
}
cat("total link number =", sum(adjmat), "for DAG network\n")
## total link number = 8414 for DAG network
# next, do a transitive reduction.
print("transitive reduction starting")
## [1] "transitive reduction starting"
source("code/transitivereduction.R")
# this takes a LONG TIME. consider just supplying the command and reading
# pre-computed?
adjmat = transReduce(adjmat)
cat("total link number =", sum(adjmat), "for transitive-reduced DAG\n")
## total link number = 8228 for transitive-reduced DAG
save(adjmat, file = file.path("processed_data/hgt_net_dag_transreduced_repro.Rdat"))
links = adjmat_to_list(adjmat)
write.table(links, file.path("processed_data/treduced_hgt_net_list.cyto.repro"),
quote = FALSE, row.name = FALSE, col.name = FALSE)
write.table(colnames(adjmat), "processed_data/kos_in_hgtnet_082415.txt", quote = FALSE,
row.name = FALSE, col.name = FALSE)
# load('processed_data/hgt_net_dag_transreduced.Rdat') adjmat = tr_adj_mat
print("done with net processing")
## [1] "done with net processing"
## describing the net. great! things coming along. now, calculate some very
## basic network stats and look at them.
indeg = colSums(adjmat)
outdeg = rowSums(adjmat)
summary(indeg)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.00 0.00 3.64 1.00 307.00
summary(outdeg)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 1.00 2.00 3.64 5.00 34.00
### so it looks like in and out degree don't follow the same distribution.
### plot these values.
hist(indeg, 100, xlab = "In-degree")
plot of chunk unnamed-chunk-1
hist(outdeg, 50, xlab = "Out-degree")
plot of chunk unnamed-chunk-1
### now as log-log.
in_tab = table(indeg)
out_tab = table(outdeg)
plot(log2(as.numeric(names(in_tab))), log2(in_tab), xlab = "Log2(In-degree)",
ylab = "Log2(Frequency)", pch = 19)
plot of chunk unnamed-chunk-1
plot(log2(as.numeric(names(out_tab))), log2(out_tab), xlab = "Log2(Out-degree)",
ylab = "Log2(Frequency)", pch = 19)
plot of chunk unnamed-chunk-1
### no obvious weirdness there. but are they super-well explained by the
### parameters of the genes in question? that would be a bad sign. read in
### these parameters.
# NEED TO REPLACE THIS REAL_GENES ISH.
# real_genes=read.table(file.path(paste(working_dir,'_null_simed_genes_new',sep=''),'real_gene_bins_1M.txt'),header=T)
pres = read.table("gainLoss_results/MOtree_GLrun/AncestralReconstructPosterior.txt.pres_probs",
header = T)
gain = read.table("gainLoss_results/MOtree_GLrun/AncestralReconstructPosterior.txt.gain_probs",
header = T)
prevalence = colSums(pres)[names(outdeg)]
gain_num = colSums(gain)[names(outdeg)]
rm(pres, gain)
presout = cor.test(prevalence, outdeg)
print(presout)
##
## Pearson's product-moment correlation
##
## data: prevalence and outdeg
## t = 6.525, df = 2258, p-value = 8.364e-11
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.09534 0.17628
## sample estimates:
## cor
## 0.136
cor.test(prevalence, outdeg, method = "spearman")
## Warning: Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: prevalence and outdeg
## S = 1.358e+09, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.2941
gainin = cor.test(gain_num, indeg)
print(gainin)
##
## Pearson's product-moment correlation
##
## data: gain_num and indeg
## t = 8.281, df = 2258, p-value = 2.22e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1314 0.2114
## sample estimates:
## cor
## 0.1717
cor.test(gain_num, indeg, method = "spearman")
## Warning: Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: gain_num and indeg
## S = 927846535, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.5177
### so some slight correlations, but nothing like the crazy previous
### observations. suggests that we are right that these parameters affect
### POWER, but are not confounders for the detection of edges themselves.
### plot the values. ```{r fig.width=4, fig.height=4}
plot(gain_num, indeg, xlab = "Gain count", ylab = "In-degree", main = gainin$estimate,
pch = ".")
plot of chunk unnamed-chunk-1
plot(prevalence, outdeg, xlab = "Prevalence", ylab = "Out-degree", main = presout$estimate,
pch = ".")
plot of chunk unnamed-chunk-1
# and the relationship between the 2 measures of degree?
outin = cor.test(outdeg, indeg)
print(outin)
##
## Pearson's product-moment correlation
##
## data: outdeg and indeg
## t = -6.949, df = 2258, p-value = 4.789e-12
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.1848 -0.1041
## sample estimates:
## cor
## -0.1447
cor.test(outdeg, indeg, method = "spearman")
## Warning: Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: outdeg and indeg
## S = 3.181e+09, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## -0.6537
print(outin)
##
## Pearson's product-moment correlation
##
## data: outdeg and indeg
## t = -6.949, df = 2258, p-value = 4.789e-12
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.1848 -0.1041
## sample estimates:
## cor
## -0.1447
plot(jitter(outdeg), jitter(indeg), ylab = "In-degree", xlab = "Out-degree",
main = outin$estimate, pch = ".")
plot of chunk unnamed-chunk-1
# before we get too far ahead of ourselves, let's do some plot some of the
# source data: what is the balance of gains vs. losses? (sanity check)
losso = read.table("gainLoss_results/MOtree_GLrun/AncestralReconstructPosterior.txt.loss_probs",
header = TRUE)
lossed = colSums(losso)
rm(losso)
gaino = read.table("gainLoss_results/MOtree_GLrun/AncestralReconstructPosterior.txt.gain_probs",
header = TRUE)
gained = colSums(gaino)
rm(gaino)
plot(lossed, gained, xlab = "Number of losses", ylab = "Number of gains", pch = ".",
xlim = c(0, 200), ylim = c(0, 200))
# text(100,175,labels=paste('PCC =
# ',round(cor(na.omit(gained),na.omit(lossed)),2),sep=''),cex=1.5)
abline(a = 0, b = 1)
plot of chunk unnamed-chunk-1
# next, what are the distribution of gains and prevalence, which we truly
# study?
hist(gain_num, 20, xlab = "Gains in the tree of a gene (probabilistic counts)")
plot of chunk unnamed-chunk-1
hist(prevalence, 40, xlab = "Prevalence in the tree of a gene (probabilistic counts)")
plot of chunk unnamed-chunk-1
# what is the expected distribution of gains, with the gainLoss computed
# rates? (these are precomputed but can be generated using the simulation
# pipeline)
simpres = read.table(gzfile("processed_data/MOtree_apesim_presence.txt.1gain_1loss.022214_rep2.gz"))
hist(simpres[, ], 40, main = "Raw rate simulated gene prevalence", xlab = "Prevalence / gene")
plot of chunk unnamed-chunk-1
simgain = read.table(gzfile("processed_data/MOtree_apesim_gain.txt.1gain_1loss.022214_rep2.gz"))
hist(simgain[, ], 20, main = "Raw rate simulated gene gain", xlab = "# gains / gene")
plot of chunk unnamed-chunk-1
rm(simpres, simgain)
# show 'spread' of each null distribution
source("code/figure_scripts/calc_null_intervals_073014.R")
## [1] "gaining"
## [1] 98493
## [1] "presing"
## [1] 98493
## [1] 1249
## [1] 1249
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.0 3.0 6.0 7.1 8.0 48.0
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.00 2.00 6.19 9.00 29.00
# THE NETWORK REWIRING ANALYSIS IS TOO BIG TO DO HERE. THE CODE FOR THESE
# IS code/intrapath2_shuffles_061414.R and code/netdist_shuffles_071614.R
# the metabolic network comparison can be done in principle just by
# uncommenting the following lines of code, though it would take maybe 30
# min to finish: note that these shuffling things are also stochastic, so
# the null distribution will vary a little bit from run to run. nonetheless,
# in each case, p should still be small or effectively 0.
# source('code/netdist_shuffles_071614.R')
# will just plot the results here: metabolic net overlap: this is computed
# in the script, just hard-coded here.
true = 78
intersect_size = read.table("processed_data/permdat_082415.txt")
p = length(which(intersect_size >= true))/length(intersect_size[, ])
hist(intersect_size[, ], 30, xlab = "Number of PGCEs", xlim = c(0, 85), main = paste("Pvalue = ",
p, sep = ""))
abline(v = true, lty = 2, lwd = 2)
plot of chunk unnamed-chunk-1
# within-pathway edge counting
permreps = dir("processed_data/intrapath_shuffles/", pattern = "sim_counts")
perms = c()
for (rep in permreps) {
perm = read.table(file.path("processed_data/intrapath_shuffles/", rep))
perms = rbind(perms, perm)
}
true = read.table("processed_data/intrapath_shuffles/true_counts_pathway_082415.txt")
p = length(which(perms[, 1] >= true))/nrow(perms)
# plot it
hist(perms[, 1], 20, xlim = c(0, 250), xlab = "Number of PGCEs", main = paste("P-value = ",
p))
abline(v = true[1, 1])
plot of chunk unnamed-chunk-1
# code/intrapath2_shuffles_061414.R is hard-coded to only run 10 shuffles at
# a time for parallelization. i run it in batch mode by means of the shell
# scriptshell_scripts/run_intrapath2_shuf.sh. code/netdist_shuffles_071614.R
# is hard-coded to run everything at once in a single job. the latter runs
# a bunch of other stats that we didn't end up using as well.
# PATHWAY-PATHWAY ASSOCIATIONS - these are also hard-coded for being run in
# batch mode via shell script shell_scripts/run_findOverPathHGT.sh.
rm(adjmat, koko, C_ij)
# RUBISCO small subunit analysis - first, write out genes from net with an
# edge to rbsS (K01602)
rbsS_genes = as.vector(links[links[, 2] == "K01602", 1])
# this file as written out contains the table S1 info.
write.table(rbsS_genes, "processed_data/reproduced_analysis/rbsS_genes.txt",
quote = FALSE, row.name = FALSE, col.name = FALSE)
# analysis of modules to get right granularity of functions - this output
# corresponds to table s2. this doesn't work very well on my system for
# whatever reason- so i am commenting it out so it is possible to see how it
# is done but it doesn't throw nasty errors. system('python
# code/kegg_enrich_new.py processed_data/reproduced_analysis/rbsS_genes.txt
# processed_data/kos_in_hgtnet_082415.txt
# processed_data/annots_helperfiles/module.parsed120113
# processed_data/reproduced_analysis/rbsS_enrich.txt')
# make the plots of sample rbsS associations
source("code/figure_scripts/plottree_ape_MO_080515.R")
plot of chunk unnamed-chunk-1
## [1] 2
plot of chunk unnamed-chunk-1
## [1] 2
# TOPOLOGICAL SORT ANALYSIS again, REQUIRES NETWORKX!! if networkx library
# were available, could uncomment the line below: system('python
# code/toposort_grouping.py processed_data/dag_hgt_net_list.cyto >
# hgt_net_genes_toposorted.txt') in lieu of being able to run that script, i
# will just use a pre-computed dataset to make the figures etc.: the
# toposort, properly processed, can be used to get the file
# processed_data/hgt_toposort_attr_082315.txt, which just maps the genes
# (KEGG KOs) to ranks.
system("python code/toposort_to_rankattr.py processed_data/toposort_090915.txt > processed_data/reproduced_analysis/hgt_toposort_attr_090915.txt")
# a lot of warnings will now print- they are just associated with drawing
# the arcs of the figure.
source("code/figure_scripts/topo_arcdiag_fig4a_072215.R")
## 1 1 -Inf
## Warning: an argument will be fractionally recycled
## Warning: an argument will be fractionally recycled
## 1 2 8.927
## Warning: an argument will be fractionally recycled
## Warning: an argument will be fractionally recycled
## 1 3 8.777
## Warning: an argument will be fractionally recycled
## Warning: an argument will be fractionally recycled
## 1 4 9.023
## Warning: an argument will be fractionally recycled
## Warning: an argument will be fractionally recycled
## 1 5 6.965
## Warning: an argument will be fractionally recycled
## Warning: an argument will be fractionally recycled
## 2 1 -Inf
## Warning: an argument will be fractionally recycled
## Warning: an argument will be fractionally recycled
## 2 2 -Inf
## Warning: an argument will be fractionally recycled
## Warning: an argument will be fractionally recycled
## 2 3 6.324
## Warning: an argument will be fractionally recycled
## Warning: an argument will be fractionally recycled
## 2 4 6.168
## Warning: an argument will be fractionally recycled
## Warning: an argument will be fractionally recycled
## 2 5 3.584
## Warning: an argument will be fractionally recycled
## Warning: an argument will be fractionally recycled
## 3 1 -Inf
## Warning: an argument will be fractionally recycled
## Warning: an argument will be fractionally recycled
## 3 2 -Inf
## Warning: an argument will be fractionally recycled
## Warning: an argument will be fractionally recycled
## 3 3 -Inf
## Warning: an argument will be fractionally recycled
## Warning: an argument will be fractionally recycled
## 3 4 5.318
## Warning: an argument will be fractionally recycled
## Warning: an argument will be fractionally recycled
## 3 5 2.708
## Warning: an argument will be fractionally recycled
## Warning: an argument will be fractionally recycled
## 4 1 -Inf
## Warning: an argument will be fractionally recycled
## Warning: an argument will be fractionally recycled
## 4 2 -Inf
## Warning: an argument will be fractionally recycled
## Warning: an argument will be fractionally recycled
## 4 3 -Inf
## Warning: an argument will be fractionally recycled
## Warning: an argument will be fractionally recycled
## 4 4 -Inf
## Warning: an argument will be fractionally recycled
## Warning: an argument will be fractionally recycled
## 4 5 2.89
## Warning: an argument will be fractionally recycled
## Warning: an argument will be fractionally recycled
## 5 1 -Inf
## Warning: an argument will be fractionally recycled
## Warning: an argument will be fractionally recycled
## 5 2 -Inf
## Warning: an argument will be fractionally recycled
## Warning: an argument will be fractionally recycled
## 5 3 -Inf
## Warning: an argument will be fractionally recycled
## Warning: an argument will be fractionally recycled
## 5 4 -Inf
## Warning: an argument will be fractionally recycled
## Warning: an argument will be fractionally recycled
## 5 5 -Inf
## Warning: an argument will be fractionally recycled
## Warning: an argument will be fractionally recycled
plot of chunk unnamed-chunk-1
## [1] "this is table s3:"
## Rank Number of genes Total out-degree Total in-degree
## [1,] 1 1593 7792 0
## [2,] 2 498 357 2512
## [3,] 3 118 73 2348
## [4,] 4 46 6 2992
## [5,] 5 5 0 376
source("code/figure_scripts/manybee_topofigure.R")
source("code/figure_scripts/plot_alltopos_motree_100814.R")
plot of chunk unnamed-chunk-1
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
source("code/figure_scripts/count_dists_of_gains_fromroot_080515.R")
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
##
## Pearson's product-moment correlation
##
## data: depth[, 1] and (1 - depth[, 2])
## t = -11.17, df = 2208, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.2705 -0.1915
## sample estimates:
## cor
## -0.2314
## Warning: Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: depth[, 1] and (1 - depth[, 2])
## S = 2.235e+09, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## -0.2423
## Warning: some notches went outside hinges ('box'): maybe set notch=FALSE
plot of chunk unnamed-chunk-1
## [1] "now plot barplot of same with 95% CIs- probably less informative."
plot of chunk unnamed-chunk-1
# 'ordering' is just a var created in the above that maps genes (KOs) to
# topological rank just hard-coding this for simplicity. a little
# duplicative. by restricting this to only annotations related to specific
# functions, and only to enrichments (i.e. not depletions), you get Table 1.
# nonetheless, there are some other interesting things here that we don't
# talk about really. again, the python script fails within this wrapper for
# unknown reasons but i am leaving the code commented.
write.table(rownames(ordering)[ordering[, ] == 1], "processed_data/reproduced_analysis/hgt_toporank1_082315.txt",
quote = FALSE, col.name = FALSE, row.name = FALSE)
# system('python code/kegg_enrich_new.py
# processed_data/reproduced_analysis/hgt_toporank1_082315.txt
# processed_data/kos_in_hgtnet_082415.txt
# processed_data/annots_helperfiles/brite.reparsed.083115
# processed_data/reproduced_analysis/hgt_toporank1_082315_enrich')
# system('cat
# processed_data/reproduced_analysis/hgt_toporank1_082315_enrich')
write.table(rownames(ordering)[ordering[, ] == 2], "processed_data/reproduced_analysis/hgt_toporank2_082315.txt",
quote = FALSE, col.name = FALSE, row.name = FALSE)
# system('python code/kegg_enrich_new.py
# processed_data/reproduced_analysis/hgt_toporank2_082315.txt
# processed_data/kos_in_hgtnet_082415.txt
# processed_data/annots_helperfiles/brite.reparsed.083115
# processed_data/reproduced_analysis/hgt_toporank2_082315_enrich')
# system('cat
# processed_data/reproduced_analysis/hgt_toporank2_082315_enrich')
write.table(rownames(ordering)[ordering[, ] == 3], "processed_data/reproduced_analysis/hgt_toporank3_082315.txt",
quote = FALSE, col.name = FALSE, row.name = FALSE)
# system('python code/kegg_enrich_new.py
# processed_data/reproduced_analysis/hgt_toporank3_082315.txt
# processed_data/kos_in_hgtnet_082415.txt
# processed_data/annots_helperfiles/brite.reparsed.083115
# processed_data/reproduced_analysis/hgt_toporank3_082315_enrich')
# system('cat
# processed_data/reproduced_analysis/hgt_toporank3_082315_enrich')
write.table(rownames(ordering)[ordering[, ] == 4], "processed_data/reproduced_analysis/hgt_toporank4_082315.txt",
quote = FALSE, col.name = FALSE, row.name = FALSE)
# system('python code/kegg_enrich_new.py
# processed_data/reproduced_analysis/hgt_toporank4_082315.txt
# processed_data/kos_in_hgtnet_082415.txt
# processed_data/annots_helperfiles/brite.reparsed.083115
# processed_data/reproduced_analysis/hgt_toporank4_082315_enrich')
# system('cat
# processed_data/reproduced_analysis/hgt_toporank4_082315_enrich')
write.table(rownames(ordering)[ordering[, ] == 5], "processed_data/reproduced_analysis/hgt_toporank5_082315.txt",
quote = FALSE, col.name = FALSE, row.name = FALSE)
# system('python code/kegg_enrich_new.py
# processed_data/reproduced_analysis/hgt_toporank5_082315.txt
# processed_data/kos_in_hgtnet_082415.txt
# processed_data/annots_helperfiles/brite.reparsed.083115
# processed_data/reproduced_analysis/hgt_toporank5_082315_enrich')
# system('cat
# processed_data/reproduced_analysis/hgt_toporank5_082315_enrich')
### PREDICTION these scripts will plot a bunch of the various figures and will
### also generate a lot of text output about the prediction quality gainLoss
### run files are provided in gainLoss_run_files/ and results in
### gainLoss_results/. All the code that I used for their post-processing is
### included but is not run in this script. the networks and gainLoss
### reconstructions for the training sets can be found as gzipped files
### included.
source("code/figure_scripts/hgt_predict_firm.R")
## [1] "ROC + precision/recall for firmicutes partition"
## number of edges: 3703
## number of predictable genes: 395
## number of gained genes: 3281
source("code/figure_scripts/hgt_predict_alpha.R")
## [1] "ROC for alpha/betaproteobacteria partition"
## number of edges: 1726
## number of predictable genes: 206
## number of gained genes: 3505
source("code/figure_scripts/hgt_predict_all.R")
## [1] "ROC for using whole dataset as predictor (overfit, probably)"
## number of edges: 8228
## number of predictable genes: 667
## number of gained genes: 3281
source("code/figure_scripts/prediction_vioplot.R")
## [1] "U-tests can be used to assess the significance of a predictive classifier (such as we used) in properly classifying data. Can be thought of as testing for the significance of the AUC, if it were a test statistic."
## [1] "test for significance of prediction for alpha/betaproteobacteria partition"
##
## Wilcoxon rank sum test with continuity correction
##
## data: as.numeric(alpha_for_roc[alpha_for_roc[, 2] == 0, 1]) and as.numeric(alpha_for_roc[alpha_for_roc[, 2] == 1, 1])
## W = 4848668, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
##
## [1] "test for significance of prediction for firmicutes partition"
##
## Wilcoxon rank sum test with continuity correction
##
## data: as.numeric(firm_for_roc[firm_for_roc[, 2] == 0, 1]) and as.numeric(firm_for_roc[firm_for_roc[, 2] == 1, 1])
## W = 16943431, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
##
## [1] "test for significance of prediction for unpartitioned dataset (overfit), predicting gains in firmicutes"
##
## Wilcoxon rank sum test with continuity correction
##
## data: as.numeric(all_for_roc[all_for_roc[, 2] == 0, 1]) and as.numeric(all_for_roc[all_for_roc[, 2] == 1, 1])
## W = 37225120, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
##
## [1] "generates figure from supp (fig s8)"
## [1] "y axis is prediction score. first panel is firmicutes, second is alpha/beta"