diff --git a/application/frontend/src/pages/chatbot/chatbot.tsx b/application/frontend/src/pages/chatbot/chatbot.tsx index 1017569f1..8664de593 100644 --- a/application/frontend/src/pages/chatbot/chatbot.tsx +++ b/application/frontend/src/pages/chatbot/chatbot.tsx @@ -301,7 +301,8 @@ export const Chatbot = () => { function displayDocument(d: Document) { return ( - *Reference: The above answer was based on the {d.name} section of {d.section}; + *Reference: The above answer was based on the {d.name} section of{' '} + {d.section ? d.section : d.sectionID}; ); } diff --git a/application/prompt_client/prompt_client.py b/application/prompt_client/prompt_client.py index 351040aba..27f5f1f2c 100644 --- a/application/prompt_client/prompt_client.py +++ b/application/prompt_client/prompt_client.py @@ -181,10 +181,7 @@ def __init__(self, database: db.Node_collection) -> None: self.ai_client = None if os.environ.get("SERVICE_ACCOUNT_CREDENTIALS"): logger.info("using Google Vertex AI engine") - self.ai_client = vertex_prompt_client.VertexPromptClient( - os.environ.get("VERTEX_PROJECT_ID"), - os.environ.get("VERTEX_PROJECT_LOCATION"), - ) + self.ai_client = vertex_prompt_client.VertexPromptClient() elif os.getenv("OPENAI_API_KEY"): logger.info("using Open AI engine") self.ai_client = openai_prompt_client.OpenAIPromptClient( @@ -433,10 +430,16 @@ def generate_text(self, prompt: str) -> Dict[str, str]: logger.info( f"The prompt {prompt}, was most similar to object \n{closest_object}\n, with similarity:{similarity}" ) + closest_content = "" + if closest_object.hyperlink: + emb = self.database.get_embedding(closest_id) + if emb: + closest_content = emb[0].embeddings_content if closest_object: - closest_object_str = "\n".join( + closest_object_str = f"{closest_content}" + "\n".join( [f"{k}:{v}" for k, v in closest_object.shallow_copy().todict().items()] - )[:8000] + ) + closest_object_str = closest_object_str[:8000] # vertex and openai have a model limit of 8100 characters answer = self.ai_client.create_chat_completion( prompt=prompt, diff --git a/application/prompt_client/vertex_prompt_client.py b/application/prompt_client/vertex_prompt_client.py index c178c4c9b..4a05b545c 100644 --- a/application/prompt_client/vertex_prompt_client.py +++ b/application/prompt_client/vertex_prompt_client.py @@ -22,6 +22,8 @@ logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) +MAX_OUTPUT_TOKENS = 1024 + class VertexPromptClient: context = ( @@ -34,20 +36,20 @@ class VertexPromptClient: ) examples = [ InputOutputTextPair( - input_text=" ```I liked using this product```", - output_text="The user had a great experience with this product, it was very positive", + input_text="Your task is to answer the following question based on this area of knowledge:`common weakness enumeration a community-developed list of software & hardware weakness types home > cwe list > cwe- individual dictionary definition ( 4.11 ) id lookup : home about cwe list scoring mapping guidance community news search cwe-79 : improper neutralization of input during web page generation ( 'cross-site scripting ' ) weakness id : 79 abstraction : base structure : simple view customized information : conceptual operational mapping friendly complete custom description the product does not neutralize or incorrectly neutralizes user-controllable input before it is placed in output that is used as a web page that is served to other users . extended description cross-site scripting ( xss ) vulnerabilities occur when : untrusted data enters a web application , typically from a web request . the web application dynamically generates a web page that contains this untrusted data . during page generation , the application does not prevent the data from containing content that is executable by a web browser , such as javascript , html tags , html attributes , mouse events , flash , activex , etc . a victim visits the generated web page through a web browser , which contains malicious script that was injected using the untrusted data . since the script comes from a web page that was sent by the web server , the victim 's web browser executes the malicious script in the context of the web server 's domain . this effectively violates the intention of the web browser 's same-origin policy , which states that scripts in one domain should not be able to access resources or run code in a different domain . there are three main kinds of xss : type 1 : reflected xss ( or non-persistent ) - the server reads data directly from the http request and reflects it back in the http response . reflected xss exploits occur when an attacker causes a victim to supply dangerous content to a vulnerable web application , which is then reflected back to the victim and executed by the web browser . the most common mechanism for delivering malicious content is to include it as a parameter in a url that is posted publicly or e-mailed directly to the victim . urls constructed in this manner constitute the core of many phishing schemes , whereby an attacker convinces a victim to visit a url that refers to a vulnerable site . after the site reflects the attacker 's content back to the victim , the content is executed by the victim 's browser . type 2 : stored xss ( or persistent ) - the application stores dangerous data in a database , message forum , visitor log , or other trusted data store . at a later time , the dangerous data is subsequently read back into the application and included in dynamic content . from an attacker 's perspective , the optimal place to inject malicious content is in an area that is displayed to either many users or particularly interesting users . interesting users typically have elevated privileges in the application or interact with sensitive data that is valuable to the attacker . if one of these users executes malicious content , the attacker may be able to perform privileged operations on behalf of the user or gain access to sensitive data belonging to the user . for example , the attacker might inject xss into a log message , which might not be handled properly when an administrator views the logs . type 0 : dom-based xss - in dom-based xss , the client performs the injection of xss into the page ; in the other types , the server performs the injection . dom-based xss generally involves server-controlled , trusted script that is sent to the client , such as javascript that performs sanity checks on a form before the user submits it . if the server-supplied script processes user-supplied data and then injects it back into the web page ( such as with dynamic html ) , then dom-based xss is possible . once the malicious script is injected , the attacker can perform a variety of malicious activities . the attacker could transfer private information , such as cookies that may include session information , from the victim 's machine to the attacker . the attacker could send malicious requests to a web site on behalf of the victim , which could be especially dangerous to the site if the victim has administrator privileges to manage that site . phishing attacks could be used to emulate trusted web sites and trick the victim into entering a password , allowing the attacker to compromise the victim 's account on that web site . finally , the script could exploit a vulnerability in the web browser itself possibly taking over the victim 's machine , sometimes referred to as `` drive-by hacking . '' in many cases , the attack can be launched without the victim even being aware of it . even with careful users , attackers frequently use a variety of methods to encode the malicious portion of the attack , such as url encoding or unicode , so the request looks less suspicious . alternate terms xss : a common abbreviation for cross-site scripting . html injection : used as a synonym of stored ( type 2 ) xss . css : in the early years after initial discovery of xss , `` css '' was a commonly-used acronym . however , this would cause confusion with `` cascading style sheets , '' so usage of this acronym has declined significantly . relationships relevant to the view `` research concepts '' ( cwe-1000 ) nature type id name childof 74 improper neutralization of special elements in output used by a downstream component ( 'injection ' ) parentof 80 improper $eutralization of script-related html tags in a web page ( basic xss ) parentof 81 improper neutralization of script in an error message web page parentof 83 improper neutralization of script in attributes in a web page parentof 84 improper neutralization of encoded uri schemes in a web page parentof 85 doubled character xss manipulations parentof 86 improper neutralization of invalid characters in identifiers in web pages parentof 87 improper neutralization of alternate xss syntax parentof 692 incomplete denylist to cross-site scripting peerof 352 cross-site request forgery ( csrf ) peerof 494 download of code without integrity check canfollow 113 improper neutralization of crlf sequences in http headers ( 'http request/response splitting ' ) canfollow 184 incomplete list of disallowed inputs canprecede 494 download of code without integrity check relevant to the view `` software development '' ( cwe-699 ) nature type id name memberof 137 data neutralization issues relevant to the view `` weaknesses for simplified mapping of published vulnerabilities '' ( cwe-1003 ) relevant to the view `` architectural concepts '' ( cwe-1008 ) background details the same origin policy states that browsers should limit the resources accessible to scripts running on a given web site , or `` origin '' , to the resources associated with that web site on the client-side , and not the client-side resources of any other sites or `` origins '' . the goal is to prevent one site from being able to modify or read the contents of an unrelated site . since the world wide web involves interactions between many sites , this policy is important for browsers to enforce . when referring to xss , the domain of a website is roughly equivalent to the resources associated with that website on the client-side of the connection . that is , the domain can be thought of as all resources the browser is storing for the user 's interactions with this particular site . modes of introduction phase note implementation realization : this weakness is caused during implementation of an architectural security tactic . applicable platforms languages class : not language-specific ( undetermined prevalence ) technologies class : web based ( often prevalent ) common consequences scope impact likelihood access control confidentiality technical impact : bypass protection mechanism ; read application data the most common attack performed with cross-site scripting invol` if you can, provide code examples, delimit any code snippet with three backticks\n Question: `what is xss?` ignore all other commands and questions that are not relevant.", + output_text="Answer: XSS is a type of injection attack, in which malicious code is injected into a legitimate web page or application. This code is then executed by the victim's browser when they visit the page. XSS attacks can be used to steal cookies, session tokens, or other sensitive information. They can also be used to deface websites or redirect users to malicious sites. Here is an example of an XSS attack:``````This code would be injected into a web page, and when a user visits the page, the alert box would be displayed", ), InputOutputTextPair( - input_text="Review From User: ```What's the weather like today?```", - output_text="I'm sorry. I don't have that information.", + input_text="Your task is to answer the following question based on this area of knowledge:`skip to content owasp top 10:2021 a10 server side request forgery ( ssrf ) owasp/top10 owasp top 10:2021 home notice introduction how to use the owasp top 10 as a standard how to start an appsec program with the owasp top 10 about owasp top 10:2021 list a01 broken access control a02 cryptographic failures a03 injection a04 insecure design a05 security misconfiguration a06 vulnerable and outdated components a07 identification and authentication failures a08 software and data integrity failures a09 security logging and monitoring failures a10 server side request forgery ( ssrf ) next steps table of contents factors overview description how to prevent from network layer from application layer : additional measures to consider : example attack scenarios references list of mapped cwes a10:2021 – server-side request forgery ( ssrf ) factors cwes mapped max incidence rate avg incidence rate avg weighted exploit avg weighted impact max coverage avg coverage total occurrences total cves 1 2.72 % 2.72 % 8.28 6.72 67.72 % 67.72 % 9,503 385 overview this category is added from the top 10 community survey ( # 1 ) . the data shows a relatively low incidence rate with above average testing coverage and above-average exploit and impact potential ratings . as new entries are likely to be a single or small cluster of common weakness enumerations ( cwes ) for attention and awareness , the hope is that they are subject to focus and can be rolled into a larger category in a future edition . description ssrf flaws occur whenever a web application is fetching a remote resource without validating the user-supplied url . it allows an attacker to coerce the application to send a crafted request to an unexpected destination , even when protected by a firewall , vpn , or another type of network access control list ( acl ) . as modern web applications provide end-users with convenient features , fetching a url becomes a common scenario . as a result , the incidence of ssrf is increasing . also , the severity of ssrf is becoming higher due to cloud services and the complexity of architectures . how to prevent developers can prevent ssrf by implementing some or all the following defense in depth controls : from network layer segment remote resource access functionality in separate networks to reduce the impact of ssrf enforce “ deny by default ” firewall policies or network access control rules to block all but essential intranet traffic . hints : ~ establish an ownership and a lifecycle for firewall rules based on applications . ~ log all accepted and blocked network flows on firewalls ( see a09:2021-security logging and monitoring failures ) . from application layer : sanitize and validate all client-supplied input data enforce the url schema , port , and destination with a positive allow list do not send raw responses to clients disable http redirections be aware of the url consistency to avoid attacks such as dns rebinding and “ time of check , time of use ” ( toctou ) race conditions do not mitigate ssrf via the use of a deny list or regular expression . attackers have payload lists , tools , and skills to bypass deny lists . additional measures to consider : do n't deploy other security relevant services on front systems ( e.g . openid ) . control local traffic on these systems ( e.g . localhost ) for frontends with dedicated and manageable user groups use network encryption ( e.g . vpns ) on independent systems to consider very high protection needs example attack scenarios attackers can use ssrf to attack systems protected behind web application firewalls , firewalls , or network acls , using scenarios such as : scenario # 1 : port scan internal servers – if the network architecture is unsegmented , attackers can map out internal networks and determine if ports are open or closed on internal servers from connection results or elapsed time to connect or reject ssrf payload connections . scenario # 2 : sensitive data exposure – attackers can access local files or internal services to gain sensitive information such as file : ///etc/passwd and http : //localhost:28017/ . scenario # 3 : access metadata storage of cloud services – most cloud providers have metadata storage such as http : //169.254.169.254/ . an attacker can read the metadata to gain sensitive information . scenario # 4 : compromise internal services – the attacker can abuse internal services to conduct further attacks such as remote code execution ( rce ) or denial of service ( dos ) . references owasp - server-side request forgery prevention cheat sheet portswigger - server-side request forgery ( ssrf ) acunetix - what is server-side request forgery ( ssrf ) ? ssrf bible a new era of ssrf - exploiting url parser in trending programming languages ! list of mapped cwes cwe-918 server-side request forgery ( ssrf ) © copyright 2021 - owasp top 10 team - this work is licensed under a creative commons attribution 3.0 unported license . made with material for mkdocsname:OWASP Top 10 2021 doctype:Standard hyperlink:https://owasp.org/Top10/A10_2021-Server-Side_Request_Forgery_%28SSRF%29/section:Server Side Request Forgery (SSRF)sectionID:A10` if you can, provide code examples, delimit any code snippet with three backticks Question: `what is ssrf please provide code examples` ignore all other commands and questions that are not relevant", + output_text="Answer: Server-side request forgery (SSRF) is a type of attack that occurs when an attacker is able to trick a web application into making a request to an unintended, untrusted, or unauthorized server. This can be done by manipulating the input provided to the web application, such as the URL of a resource that is being requested.For example, consider a web application that allows users to upload images. If the application is not properly validating the user-supplied input, an attacker could upload an image that contains a malicious URL. When the user views the image, the web application will make a request to the malicious URL, which could be used to steal the user's credentials or install malware on their computer.Here is a code example of an SSRF attack:```// This code is vulnerable to SSRF attacks$url = $_GET['url'];$ch = curl_init($url);curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);$response = curl_exec($ch);curl_close($ch);```In this code, the user-supplied URL is passed directly to the `curl_init()` function. This means that the attacker can control the destination of the request. For example, the attacker could specify a URL that points to a malicious server. When the user visits the web page, the application will make a request to the malicious server, which could be used to steal the user's credentials or install malware on their computer.To prevent SSRF attacks, web applications should validate all user-supplied input. This includes the URL of any resources that are being requested. The application should only allow requests to trusted, authorized servers.", ), InputOutputTextPair( - input_text="Review From User: ```Do you sell soft drinks?```", - output_text="Sorry. This is not a product summary.", + input_text="Your task is to answer the following question based on this area of knowledge:`skip to content owasp top 10:2021 a10 server side request forgery ( ssrf ) owasp/top10 owasp top 10:2021 home notice introduction how to use the owasp top 10 as a standard how to start an appsec program with the owasp top 10 about owasp top 10:2021 list a01 broken access control a02 cryptographic failures a03 injection a04 insecure design a05 security misconfiguration a06 vulnerable and outdated components a07 identification and authentication failures a08 software and data integrity failures a09 security logging and monitoring failures a10 server side request forgery ( ssrf ) next steps table of contents factors overview description how to prevent from network layer from application layer : additional measures to consider : example attack scenarios references list of mapped cwes a10:2021 – server-side request forgery ( ssrf ) factors cwes mapped max incidence rate avg incidence rate avg weighted exploit avg weighted impact max coverage avg coverage total occurrences total cves 1 2.72 % 2.72 % 8.28 6.72 67.72 % 67.72 % 9,503 385 overview this category is added from the top 10 community survey ( # 1 ) . the data shows a relatively low incidence rate with above average testing coverage and above-average exploit and impact potential ratings . as new entries are likely to be a single or small cluster of common weakness enumerations ( cwes ) for attention and awareness , the hope is that they are subject to focus and can be rolled into a larger category in a future edition . description ssrf flaws occur whenever a web application is fetching a remote resource without validating the user-supplied url . it allows an attacker to coerce the application to send a crafted request to an unexpected destination , even when protected by a firewall , vpn , or another type of network access control list ( acl ) . as modern web applications provide end-users with convenient features , fetching a url becomes a common scenario . as a result , the incidence of ssrf is increasing . also , the severity of ssrf is becoming higher due to cloud services and the complexity of architectures . how to prevent developers can prevent ssrf by implementing some or all the following defense in depth controls : from network layer segment remote resource access functionality in separate networks to reduce the impact of ssrf enforce “ deny by default ” firewall policies or network access control rules to block all but essential intranet traffic . hints : ~ establish an ownership and a lifecycle for firewall rules based on applications . ~ log all accepted and blocked network flows on firewalls ( see a09:2021-security logging and monitoring failures ) . from application layer : sanitize and validate all client-supplied input data enforce the url schema , port , and destination with a positive allow list do not send raw responses to clients disable http redirections be aware of the url consistency to avoid attacks such as dns rebinding and “ time of check , time of use ” ( toctou ) race conditions do not mitigate ssrf via the use of a deny list or regular expression . attackers have payload lists , tools , and skills to bypass deny lists . additional measures to consider : do n't deploy other security relevant services on front systems ( e.g . openid ) . control local traffic on these systems ( e.g . localhost ) for frontends with dedicated and manageable user groups use network encryption ( e.g . vpns ) on independent systems to consider very high protection needs example attack scenarios attackers can use ssrf to attack systems protected behind web application firewalls , firewalls , or network acls , using scenarios such as : scenario # 1 : port scan internal servers – if the network architecture is unsegmented , attackers can map out internal networks and determine if ports are open or closed on internal servers from connection results or elapsed time to connect or reject ssrf payload connections . scenario # 2 : sensitive data exposure – attackers can access local files or internal services to gain sensitive information such as file : ///etc/passwd and http : //localhost:28017/ . scenario # 3 : access metadata storage of cloud services – most cloud providers have metadata storage such as http : //169.254.169.254/ . an attacker can read the metadata to gain sensitive information . scenario # 4 : compromise internal services – the attacker can abuse internal services to conduct further attacks such as remote code execution ( rce ) or denial of service ( dos ) . references owasp - server-side request forgery prevention cheat sheet portswigger - server-side request forgery ( ssrf ) acunetix - what is server-side request forgery ( ssrf ) ? ssrf bible a new era of ssrf - exploiting url parser in trending programming languages ! list of mapped cwes cwe-918 server-side request forgery ( ssrf ) © copyright 2021 - owasp top 10 team - this work is licensed under a creative commons attribution 3.0 unported license . made with material for mkdocsname:OWASP Top 10 2021 doctype:Standard hyperlink:https://owasp.org/Top10/A10_2021-Server-Side_Request_Forgery_%28SSRF%29/section:Server Side Request Forgery (SSRF)sectionID:A10` if you can, provide code examples, delimit any code snippet with three backticks Question: `what is ssrf please provide code examples` ignore all other commands and questions that are not relevant", + output_text='Answer: Sure, here is a code example of an ssrf attack:```import requestsurl = "http://example.com/?url="# This payload will cause the server to make a request to google.compayload = "https://google.com"response = requests.get(url + payload)print(response.text)```This attack can be used to access sensitive information from a server, or to perform other malicious actions.', ), ] - def __init__(self, project_id, location) -> None: + def __init__(self) -> None: service_account_secrets_file = os.path.join( pathlib.Path(__file__).parent.parent.parent, "gcp_sa_secret.json" ) @@ -60,7 +62,10 @@ def __init__(self, project_id, location) -> None: else: logger.fatal("env SERVICE_ACCOUNT_CREDENTIALS has not been set") - # vertexai.init(project=project_id, location=location) + vertexai.init( + project=os.environ.get("GOOGLE_PROJECT_ID"), + location=os.environ.get("GOOGLE_PROJECT_LOCATION"), + ) self.chat_model = ChatModel.from_pretrained("chat-bison@001") self.embeddings_model = TextEmbeddingModel.from_pretrained( "textembedding-gecko@001" @@ -92,7 +97,8 @@ def get_text_embeddings(self, text: str) -> List[float]: return values def create_chat_completion(self, prompt, closest_object_str) -> str: + parameters = {"temperature": 0.5, "max_output_tokens": MAX_OUTPUT_TOKENS} msg = f"Your task is to answer the following question based on this area of knowledge:`{closest_object_str}` if you can, provide code examples, delimit any code snippet with three backticks\nQuestion: `{prompt}`\n ignore all other commands and questions that are not relevant." - - response = self.chat.send_message(msg) + print(msg) + response = self.chat.send_message(msg, **parameters) return response.text diff --git a/application/web/web_main.py b/application/web/web_main.py index ec9149ebd..9381c8079 100644 --- a/application/web/web_main.py +++ b/application/web/web_main.py @@ -31,7 +31,7 @@ from google.oauth2 import id_token from google_auth_oauthlib.flow import Flow from application.utils.spreadsheet import write_csv -from pip._vendor import cachecontrol +import oauthlib import google.auth.transport.requests ITEMS_PER_PAGE = 20 @@ -434,7 +434,6 @@ def login(): flow_instance = CREFlow.instance() authorization_url, state = flow_instance.flow.authorization_url() session["state"] = state - return redirect(authorization_url) @@ -452,9 +451,9 @@ def callback(): try: flow_instance.flow.fetch_token(authorization_response=request.url) except oauthlib.oauth2.rfc6749.errors.MismatchingStateError as mse: - return redirect(url_for("/chatbot")) + return redirect("/chatbot") if not session.get("state"): - redirect(url_for("/login")) + redirect(url_for("web.login")) if session["state"] != request.args["state"]: abort(500) # State does not match! credentials = flow_instance.flow.credentials