# ---------------------------------------------------------------------------
# api/resources/interactions.py  (patched section, index 8168876..7fa2c9e)
# ---------------------------------------------------------------------------
from flask_restx import Namespace, Resource, fields
from flask import request, jsonify
from markupsafe import escape
from api.utils.bar_utils import BARUtils
from api.utils.mfinder_utils import MfinderUtils
from marshmallow import Schema, ValidationError, fields as marshmallow_fields
from api import db
from api.models.rice_interactions import Interactions as RiceInteractions

# ... module content that the patch leaves unchanged is not shown here ...


@itrns.route("/mfinder")
class MFinder(Resource):
    @itrns.expect(post_int_data)
    def post(self):
        """This endpoint was originally written by Vincent Lau to return mFinder
        results to AGENT in his express node.JS app. However Tianhui Zhao refactored
        to the BAR_API

        The JSON body carries the edge list under "data" and, optionally, the
        mFinder settings under "options". Every validation failure is answered
        with a 400 and an error payload built by BARUtils.error_exit.
        """
        data = request.get_json()
        # Validate json
        try:
            data = MFinderDataSchema().load(data)
        except ValidationError as err:
            return BARUtils.error_exit(err.messages), 400

        # input_validation returns an error string on failure, the cleaned edge list otherwise
        filtered_valid_arr = MfinderUtils.input_validation(data["data"])
        if isinstance(filtered_valid_arr, str):
            return BARUtils.error_exit(filtered_valid_arr), 400

        # settings_validation reports a problem as a (message, status) tuple and
        # returns the settings dict otherwise; without this check a bad option
        # would only surface later as an unhandled exception (HTTP 500).
        settings = MfinderUtils.settings_validation(data.get("options", {}))
        if isinstance(settings, tuple):
            return BARUtils.error_exit(settings[0]), settings[1]

        ret_json = MfinderUtils.create_files_and_mfinder(filtered_valid_arr, settings)
        return jsonify(MfinderUtils.beautify_results(ret_json))


# ---------------------------------------------------------------------------
# api/utils/mfinder_utils.py  (new file, index 0000000..c0f10e2)
# ---------------------------------------------------------------------------
from api.utils.bar_utils import BARUtils

import tempfile
import os
import subprocess
from collections import defaultdict


class MfinderUtils:

    @staticmethod
    def uniq_with(arr, comp_func):
        """Return the items of arr with duplicates removed, keeping first occurrences.

        Two items are duplicates when comp_func(item, kept_item) is truthy.
        Every item is compared against every kept item, so this is quadratic;
        it exists for callers that need a custom equality.
        """
        unique_arr = []
        for item in arr:
            if not any(comp_func(item, unique_item) for unique_item in unique_arr):
                unique_arr.append(item)
        return unique_arr

    @staticmethod
    def is_equal(a, b):
        """Return True when a == b (default comparator for uniq_with)."""
        return a == b

    @staticmethod
    def find_key(d, value):
        """Return the first key of d whose value equals value.

        Raises StopIteration when no such key exists (behaviour of next()
        without a default).
        """
        return next(key for key, val in d.items() if val == value)

    # Check if JSON body data obj is an array of arrays (2d arr)
    # ex [ [ "AT1G010100", "AT5G01010" ], ["AT3G10000", "AT2G03240"]]
    @staticmethod
    def input_validation(input):
        """Validate the edge list and strip self-edges and duplicate edges.

        Args:
            input: Expected to be a list of two-element lists of gene ID
                strings. (The name shadows the builtin; it is kept because it
                is part of the existing signature.)

        Returns:
            An error message (str) when validation fails, otherwise the list
            of [source, target] edges with self-edges and repeated edges
            removed, in first-seen order.
        """
        if not isinstance(input, list):
            return "invalid JSON, not an arr"

        if len(input) == 0:
            return "arr length 0!"

        # The type check has to come before any len() call: len() on a
        # non-sequence member (e.g. the number 1) raises TypeError.
        if not all(isinstance(i, list) for i in input):
            return "invalid JSON, check arr members are arrs!"

        if any(len(i) != 2 for i in input):
            return "inner arr length is not of length 2!"

        if not all(isinstance(j, str) for i in input for j in i):
            return "invalid JSON, check if inside arr members are strings!"

        if not all(BARUtils.is_arabidopsis_gene_valid(j) for i in input for j in i):
            return "Invalid gene ID contained!"

        # filter self-edges and duplicate edges (mFinder does not accept)
        seen = set()
        edges = []
        for source, target in input:
            if source == target or (source, target) in seen:
                continue
            seen.add((source, target))
            edges.append([source, target])

        # Nothing left means every submitted edge was a self-edge.
        if not edges:
            return "no edges left after removing self-edges!"

        return edges

    # Some mFinder params are allowed, within reasonable server load. mFinder takes these basic params: nd (non-directed network),
    # r (# of rand networks to gen), s (motif size), u (unique min), z (z-score min). mFinder's own defaults are directed, 100, 3, 4, & 2
    # respectively. HOWEVER this API defaults r to 50 for speed.
    # Do a validation check on each value too!
# opts: the JSON settings object; it may be empty or missing, in which case the defaults are used
@staticmethod
def settings_validation(opts):
    """Validate the mFinder options and fill in defaults for the missing ones.

    Args:
        opts: Mapping of option name to value, or a falsy value for "no
            options". The keys nd, r, s, u and z are validated; any other key
            is copied through unchanged.

    Returns:
        The completed settings dict on success. On failure a
        ``(message, 400)`` tuple is returned instead of raising, so callers
        must check the result type before using it.
    """
    opts = opts or {}
    if not isinstance(opts, dict):
        return "incorrect options - is it an object?", 400

    # Same rule as injection_check, applied here so that its verdict is
    # actually returned to the caller instead of being discarded.
    for key, value in opts.items():
        if len(str(value)) > 10:
            return f"{key} settings param is too long", 400

    def is_int(value):
        # bool is a subclass of int; True/False must not pass as numbers.
        return isinstance(value, int) and not isinstance(value, bool)

    settings_obj = opts.copy()

    if "nd" not in opts:
        settings_obj["nd"] = False
    elif not isinstance(opts["nd"], bool):
        return "incorrect nd setting - is it boolean?", 400

    if "r" not in opts:
        settings_obj["r"] = 50
    elif not is_int(opts["r"]) or not 0 <= opts["r"] <= 150:
        return "incorrect r setting - is it a number under 151?", 400

    if "s" not in opts:
        settings_obj["s"] = 3
    elif not is_int(opts["s"]) or not 2 <= opts["s"] <= 4:
        return "incorrect s setting - is it a number between 2 and 4?", 400

    if "u" not in opts:
        settings_obj["u"] = 4
    elif not is_int(opts["u"]) or not 0 <= opts["u"] <= 999:
        return "incorrect u setting - is it a number or below 1000?", 400

    if "z" not in opts:
        settings_obj["z"] = 2
    elif not is_int(opts["z"]) or opts["z"] > 99:
        return "incorrect z setting - is it a number or below 100?", 400

    return settings_obj

# Check for injection: report (not raise) when a suspiciously long value is found.
# obj: the settings mapping to check
@staticmethod
def injection_check(obj):
    """Return ``(message, 400)`` for the first value longer than 10 characters.

    The length is measured on ``str(value)``. Returns None when every value
    is short enough.
    """
    for key, value in obj.items():
        if len(str(value)) > 10:
            return f"{key} settings param is too long", 400
    return None

# Take in the filtered array of gene-id pairs (edges) and perform
# mFinder analysis on them (create temp text files to do so)
# Performed SYNCHRONOUSLY !!!
@staticmethod
def create_files_and_mfinder(input, opts_obj):
    """Run the mFinder binary on the given edges and return its raw output.

    Args:
        input: List of [source, target] gene-ID pairs, as produced by
            input_validation. (The name shadows the builtin; it is kept
            because it is part of the existing signature.)
        opts_obj: Settings dict with the keys s, r, u, z and optionally nd.

    Returns:
        dict with "hashOfIds" (number -> gene ID), "mFinderStats" (text of
        the *_OUT.txt file) and "mFinderMembers" (text of the *_MEMBERS.txt
        file).

    Raises:
        subprocess.CalledProcessError: mFinder exited with a non-zero status.
        OSError: an expected output file could not be read.
    """
    # get a hash of IDs -> numbers for later lookup and writable string
    hash_of_ids, return_str = MfinderUtils.get_gene_id_hash_map(input)

    # delete=False because the file is reopened by name by mFinder after we
    # close it; leaving the with-block flushes and closes our handle first.
    with tempfile.NamedTemporaryFile(mode="w+", suffix=".txt", delete=False) as tmpfile:
        input_path = tmpfile.name
        # give read/write permissions to user but nada to anybody else
        os.chmod(input_path, 0o600)
        # write to temp file which mFinder will run/read on
        tmpfile.write(return_str)

    # Output files are looked up next to the input: <name>_OUT.txt and <name>_MEMBERS.txt
    base_path = input_path[:-4]
    stats_path = base_path + "_OUT.txt"
    members_path = base_path + "_MEMBERS.txt"

    # An argument list with the default shell=False: no value is ever
    # interpreted by a shell, whatever it contains.
    command = [
        "/bartmp/mfinder",
        input_path,
        "-s", str(opts_obj["s"]),
        "-r", str(opts_obj["r"]),
        "-u", str(opts_obj["u"]),
        "-z", str(opts_obj["z"]),
    ]
    if opts_obj.get("nd"):
        command.append("-nd")
    command.append("-omem")

    try:
        subprocess.run(command, check=True)

        with open(stats_path, "r") as stats_file:
            mfinder_stats = stats_file.read()

        with open(members_path, "r") as members_file:
            mfinder_members = members_file.read()
    finally:
        # Clean up the input and both output files on success and on failure.
        for path in (input_path, stats_path, members_path):
            try:
                os.remove(path)
            except FileNotFoundError:
                # Best effort: the file may never have been created.
                pass

    return {"hashOfIds": hash_of_ids, "mFinderStats": mfinder_stats, "mFinderMembers": mfinder_members}

# Take an input of array of array of strings which represent edges and transform those gene IDs (unique!) to a hash table and
# coinciding edges i.e.
# [["PHE", "PAT"], ["PAT", "PAN"]] to "1 2 1\n2 3 1\n" (IDs are numbered 1, 2, 3, ... in order of first appearance)
@staticmethod
def get_gene_id_hash_map(input):
    """Number the gene IDs and render the edges in mFinder's input format.

    Args:
        input: List of [source, target] gene-ID pairs. (The name shadows the
            builtin; it is kept because it is part of the existing signature.)

    Returns:
        Tuple ``(hash_of_ids, return_str)``: hash_of_ids maps each assigned
        number (starting at 1) to its gene ID, and return_str holds one
        "<source number> <target number> 1" line per edge.
    """
    # Kept as a defaultdict so that looking up an unknown number still yields None.
    hash_of_ids = defaultdict(lambda: None)
    # Reverse index (gene ID -> number) so each lookup is O(1) instead of a
    # scan over all values for every edge.
    number_of = {}
    edge_lines = []
    for item in input:
        for gene in (item[0], item[1]):
            if gene not in number_of:
                number_of[gene] = len(number_of) + 1
                hash_of_ids[number_of[gene]] = gene
        edge_lines.append(f"{number_of[item[0]]} {number_of[item[1]]} 1\n")

    return hash_of_ids, "".join(edge_lines)

# Beautify the output file string and members file string
@staticmethod
def beautify_results(mfinder_res_obj):
    """Parse mFinder's stats and members text into a structured payload.

    Args:
        mfinder_res_obj: dict with the keys "mFinderStats", "mFinderMembers"
            and "hashOfIds", as returned by create_files_and_mfinder.

    Returns:
        The result of BARUtils.success_exit on a dict with "sigMotifs"
        (motif id -> statistics, all values as strings) and "motifList"
        (subgraph id -> list of tab-separated gene-ID triples).

    Raises:
        ValueError: the "[MILI]" delimiter is missing from the stats text.
    """
    stats = mfinder_res_obj["mFinderStats"]
    mems = mfinder_res_obj["mFinderMembers"]
    id_map = mfinder_res_obj["hashOfIds"]
    ret_obj = {"sigMotifs": {}, "motifList": {}}

    try:
        sig_motifs_str = stats.split("[MILI]\t\n\n")[1].split("Full")[0].split("\n\n")
    # In case stats has less than 2 parts after split('[MILI]\t\n\n')[1]
    except IndexError as err:
        raise ValueError("Expected delimiter '[MILI]\t\n\n' or 'Full' not found in the stats string.") from err

    # Keep every second chunk and ignore the last two.
    # NOTE(review): presumably the skipped chunks are secondary lines of the
    # table - confirm against the mFinder output format.
    sig_motifs_str = sig_motifs_str[: len(sig_motifs_str) - 2 : 2]
    for item in sig_motifs_str:
        split_stats_for_motif_id = item.split("\t")
        ret_obj["sigMotifs"][split_stats_for_motif_id[0]] = {
            "numAppearances": split_stats_for_motif_id[1],
            "numAppearancesRand": split_stats_for_motif_id[2],
            "appearancesZScore": split_stats_for_motif_id[3],
            "pValue": split_stats_for_motif_id[4],
            "uniq": split_stats_for_motif_id[5],
            "conc": split_stats_for_motif_id[6],
        }

    subgraphs_list_str = mems.split("subgraph id = ")[1:]
    for subgraph_str in subgraphs_list_str:
        member_list_split = subgraph_str.split("\n")
        # Lines 5 .. -2 of each section are read as the member rows; the first
        # line of the section is the subgraph id used as the key below.
        motif_mem_list = [i.rstrip("\t") for i in member_list_split[5:-2]]
        motif_mem_results = []
        for i in motif_mem_list:
            three_genes = i.split("\t")
            # Translate mFinder's numbers back to gene IDs, i.e. PAT\tPAN\tEGFR
            formatted_str = f"{id_map[int(three_genes[0])]}\t{id_map[int(three_genes[1])]}\t{id_map[int(three_genes[2])]}"
            motif_mem_results.append(formatted_str)
        ret_obj["motifList"][member_list_split[0]] = motif_mem_results

    return BARUtils.success_exit(ret_obj)


# ---------------------------------------------------------------------------
# tests/resources/test_interactions.py  (patched section, index 3f089b4..a99c04c)
# ---------------------------------------------------------------------------
from api import app
from unittest import TestCase
import json
from json import load
import os


class TestIntegrations(TestCase):

    # ... tests that the patch leaves unchanged (up to test_post_itrns) are not shown here ...

    def test_mfinder(self):
        """
        Test the mFinder endpoint via POST.
        """
        # Valid request
        # skip pytest in github environment
        # NOTE(review): this branch compares the expected JSON with itself, so
        # it only proves the fixture file is loadable.
        if os.getenv("GITHUB_ACTIONS") == "true":
            with open("tests/data/mfinder_output.json") as json_file_2:
                expected = load(json_file_2)
            data = expected
            self.assertEqual(data, expected)
        else:
            with open("tests/data/mfinder_input.json") as json_file_1:
                input_data = load(json_file_1)
            response = self.app_client.post(
                "/interactions/mfinder",
                json=input_data,
            )
            data = json.loads(response.get_data(as_text=True))
            with open("tests/data/mfinder_output.json") as json_file_2:
                expected = load(json_file_2)
            self.assertEqual(data, expected)

        # Invalid data structure
        response = self.app_client.post("/interactions/mfinder", json={"data": {}})
        data = json.loads(response.get_data(as_text=True))
        expected = {"wasSuccessful": False, "error": {"data": ["Not a valid list."]}}
        self.assertEqual(data, expected)

        response = self.app_client.post("/interactions/mfinder", json={"data": []})
        data = json.loads(response.get_data(as_text=True))
        expected = {"wasSuccessful": False, "error": "arr length 0!"}
        self.assertEqual(data, expected)

        response = self.app_client.post(
            "/interactions/mfinder", json={"data": [["AT5G67420", "AT1G12110"], ["AT5G67420"]]}
        )
        data = json.loads(response.get_data(as_text=True))
        expected = {"wasSuccessful": False, "error": "inner arr length is not of length 2!"}
        self.assertEqual(data, expected)

        response = self.app_client.post("/interactions/mfinder", json={"data": [["AT5G67420", "AT1G12110"], 1]})
        data = json.loads(response.get_data(as_text=True))
        expected = {"wasSuccessful": False, "error": {"data": {"1": ["Not a valid list."]}}}
        self.assertEqual(data, expected)

        response = self.app_client.post(
            "/interactions/mfinder", json={"data": [["AT5G67420", "AT1G12110"], ["AT5G67420", 1]]}
        )
        data = json.loads(response.get_data(as_text=True))
        expected = {"wasSuccessful": False, "error": {"data": {"1": {"1": ["Not a valid string."]}}}}
        self.assertEqual(data, expected)

        # Invalid gene ID
        response = self.app_client.post(
            "/interactions/mfinder", json={"data": [["AT1G01010", "AT5G01010"], ["001G01030", "AT2G03240"]]}
        )
        data = json.loads(response.get_data(as_text=True))
        expected = {"wasSuccessful": False, "error": "Invalid gene ID contained!"}
        self.assertEqual(data, expected)