diff --git a/SIAB/io/pseudopotential/kernel.py b/SIAB/io/pseudopotential/kernel.py index 6ecbe417..b228720b 100644 --- a/SIAB/io/pseudopotential/kernel.py +++ b/SIAB/io/pseudopotential/kernel.py @@ -4,26 +4,52 @@ def iter_tree(root: ET.Element): return {child.tag: {"attrib": child.attrib, "data": child.text} for child in list(root.iter())} def preprocess(fname: str): - """ADC pseudopotential has & symbol at the beginning of line, which is not allowed in xml, replace & with &""" + """ Pseudopotential XML file preprocess + + preprocess is relatively hard-coded. There are some cases the UPF file is not standard xml, therefore + some modifications are needed. Cases considered: + 1. ADC pseudopotential has & symbol at the beginning of line, which is not allowed in xml, replace `&` + with `&` + 2. GBRV pseudopotential does not startswith , and not endswith , add + to the beginning of the file and to the end of the file + 3. some of ADC pseudopotentials have `CDATA` tag, which is not allowed in xml, remove them + """ + import re, uuid, os + ftemp = f"{str(uuid.uuid3(uuid.NAMESPACE_DNS, fname))}.xml" + #print(f"Preprocessing {fname}, will write standard XML formatted temporaray file to {ftemp}") + # because a pseudopotential file would not be large, directly read all lines into memory with open(fname, "r") as f: lines = f.readlines() - """GBRV pseudopotential does not startswith , but , - add to the beginning of the file and to the end of the file""" - if not lines[0].startswith("\n") - lines.append("") - - with open(fname, "w") as f: - for line in lines: - """if line starts with &, replace & with &, - but if already &, do not replace""" - if line.strip().startswith("&") and not line.strip().startswith("&"): - line = line.replace("&", "&") - + # it is compulsory for all XML files to start with + if not lines[0].startswith("\n") + # from the last line, check if the first line startswith `<` is not . If not, add to the end of the file + i = -1 + while not lines[i].strip() and i > -len(lines): + i -= 1 + if not "" in lines[i]: + lines.append("\n") + # write the modified lines back to the file + # there are more than one tasks remain: + # for ADC pseudopotential, replace `&` with `&` + # for PseudoDojo v1.0 pseudopotential, carefully check consistency between opening and closing tags + # for ADC pseudopotential or ld1 generated, replace eliminate both `` if they exist + lines = [line.replace("", "") for line in lines] + with open(ftemp, "w") as f: + # replace the `&` symbol at the beginning of the line with `&`, this is done by xml_syntax_filter + for _, line in xml_syntax_filter(lines): + #for line in lines: + _match = re.match(r"^([\s]*)(\&[\w]+)([^;]*)(\s*)$", line) + line = f"{_match.group(1)}{_match.group(2).replace('&', '&')}{_match.group(3)}{_match.group(4)}\n"\ + if _match else line f.write(line) - - if line.strip() == "": + if "" in line: # there are some pseudopotential files endswith ppgen file, but will crash the xml parser break + return os.path.abspath(ftemp) import SIAB.io.pseudopotential.tools.basic as siptb def postprocess(parsed: dict): @@ -50,20 +76,22 @@ def postprocess(parsed: dict): def upf(fname: str): """parse the pseudopotential file, return a dictionary""" + import os error_msg = """ERROR: UPF file with non-XML format. Please contact with either developer of pseudopotential you use or the developer of this package. For the latter choice, you can submit issue in Github Repository at: https://github.com/kirk0830/abacus_orbital_generation , thanks for understanding, raise TypeError and Quit...""" - preprocess(fname) + ftemp = preprocess(fname) try: - tree = ET.parse(fname) + tree = ET.parse(ftemp) except ET.ParseError: print(error_msg, flush=True) raise TypeError("ERROR: Please read the error message above.") from None root = tree.getroot() parsed = iter_tree(root) postprocess(parsed) + os.remove(ftemp) return parsed def vwr(fname: str): @@ -191,6 +219,110 @@ def vwr(fname: str): } return out +def xml_syntax_filter(content: str|list[str]): + """it is found that some pseudopotential may have incorrect xml format, this function is to check the consistency of tags + and correct if possible. A two-member tuple is returned, the first element is a boolean indicating whether the nearest tag is closed, + the second element is the corrected line. + + ```python + with open("file.xml", "r") as f: + content = f.readlines() + for is_closed, line in xml_syntax_filter(content): + print(line) + ``` + """ + import re + content = content.split("\n") if isinstance(content, str) else content + # for there are tags like "", then it will be a single tag + # or + buf = "" + + # regular expression example + resg = r"<[^>]+/>" # <.../> + reop = r"<[^/][^>]+>" # <...> + recls = r"]+>" # + relicmp = r"<[^>]+" # <... + rericmp = r"[^>]+>" # ...> + recmt = r"" # + + # the stack to store the names of the tags, when a closing tag is found, it should be the same as the nearest opened tag + # otherwise, it is a mismatch error and correction is needed + + # there is an exception caused by the opening comment: 20240506 + inbuilt_name_stack = [] + for line in content: # loop over all lines + l = line.strip() # copy the value then do the strip operation + # XML comments can be add both in one line and across line. However, + # only in pslibrary 1.0.0 the across-line-comment is encountered... + # but it is not clever to add more complexity (I mean more regular + # expressions) to handle this case + if re.match(recmt, l): + yield True, line + # single tag case, with the form <.../>, directly yield it, remember + # to yield the original line, rather than the stripped one + elif re.match(resg, l): + yield True, line + # opening tag case, with the form <...>. If this form appears, it + # is possible that there are plenty of data belonging to this tag. + # To save the tag name into the stack, then yield the original line + elif re.match(reop, l): + name = re.match(r"<([^ >]+)", l).group(1) + inbuilt_name_stack.append(name) + yield False, line + # closing tag case, with the form , the end of the possibilty + # above. If this form appears, then it is the end of the tag, so pop + # the tag name from the stack, then yield the original line. However, + # for PseudoDojo v1.0 pseudopotential, there would be mismatch between + # the opening tag and the closing tag, so need to correct it + elif re.match(recls, l): + name = re.match(r"]+)", l).group(1) + yield (True, line) if inbuilt_name_stack[-1] == name \ + else (True, line.replace(name, inbuilt_name_stack[-1])) + inbuilt_name_stack.pop() + # the case that a "left tag incomplete", means the tag is not closed but like + # + # in this case, will use buffer to store all content until the tag is closed + # then yield-back the complete tag. + # however, must be sure the buffer is empty before the next tag + # EXCEPTION: read ": + assert buf != "", "buffer is empty" + buf = " ".join([buf, l]) + if re.match(resg, buf): + yield True, buf + "\n" + buf = "" + elif re.match(reop, buf): + name = re.match(r"<([^ >]+)", buf).group(1) + inbuilt_name_stack.append(name) + yield False, buf + "\n" + buf = "" + else: + raise ValueError(f"incorrect tag: {buf}") + # for not-tag case, might be data, so in this case, do not touch + # it. but it can also be within tag, say incomplete tag. + else: + # if buffer is not empty and the current line does not have any tag + # related information, then it would be the situation within the tag + # or out of any tags. Fortunately the tag will always start with its + # name, or say the symbol `<` always appear to be in the same line + # as the tag name, therefore once it is really within the tag, the + # buffer will at least has the content of tag name, rather than empty + if buf != "": + buf = " ".join([buf, l]) + continue + # if buffer is empty, then it is not within any tag, just yield the line + else: + yield True, line + import unittest class TestPspotKernel(unittest.TestCase): def test_vwr(self): diff --git a/SIAB/io/pseudopotential/tools/advanced.py b/SIAB/io/pseudopotential/tools/advanced.py index ae1a1b05..355440b2 100644 --- a/SIAB/io/pseudopotential/tools/advanced.py +++ b/SIAB/io/pseudopotential/tools/advanced.py @@ -3,80 +3,116 @@ def determine_type(parsed: dict): """pseudopotentials can be generated by not only one codes, to extract information from PP_INFO, PP_INPUTFILE, need to know the exact way how information is organized """ - error_msg = """ERROR: pseudopotential with not recognized type, this is due to limited support + error_msg = """ +################################################################################################## +! ERROR ERROR ERROR ERROR ERROR ERROR ERROR ERROR ERROR ERROR ERROR ERROR ERROR ERROR ERROR ERROR +################################################################################################## + +Pseudopotential with not recognized type, this is due to only limited support implemented on orbital auto-generation feature. You can submit issue on Github Repository at: -https://github.com/kirk0830/abacus_orbital_generation, and for instantly use, you can follow the +https://github.com/kirk0830/abacus_orbital_generation. For instantly use, you can follow the following guidelines. Please check manually the pseudopotential file you provided and specify: 1. ALL `nbands` with specific number 2. ALL `nbands_ref` with specific number -3. ALL `orb_conf` and `orb_ref` with specific configuration like either `[0, 1, 2, 3]` or `0s1p2d3f` - -ADDITIONALLY! Please check your input, if you are using `optimizer` = `bfgs`, with -`spill_guess` as `atomic`, a monomer (single atom) calculation will be run to generate the -initial guess for orbital optimization. You should explicitly create one section in -`reference_systems` for monomer calculation (setting `shape` to be `monomer`), and specify the -`nbands` rationally (can be very high, but may also be dangerous because PW calculation cannot -calculate isolated atom electronic structure well, artifact brought about by PW will pollute the -energy level information). - +3. ALL `lmaxmax` with specific number +4. ALL `orb_conf` and `orb_ref` with specific configuration like either `[0, 1, 2, 3]` or `0s1p2d3f` +5. If you are using `optimizer` = `bfgs`, with `spill_guess` as `atomic`, a monomer (single atom) +calculation will be run to generate the initial guess for orbital optimization. You should explicitly +create one section in `reference_systems` for monomer calculation (setting `shape` to be `monomer`), +and specify the `nbands` rationally (can be very high, but may also be dangerous because PW +calculation cannot calculate isolated atom electronic structure well, artifact brought about by PW +will pollute the energy level information). + +Make sure in your input the following tags are correctly set: +{ + "element": [the element symbol], + + "reference_systems": [ + { + "shape": "monomer", + "nbands": NUMBER, + "nspin": NUMBER, + "lmaxmax": NUMBER + }, + { + "shape": "dimer", + "nbands": NUMBER, + "nspin": NUMBER, + "lmaxmax": NUMBER, + "bond_lengths": (can be "auto") + }, + ... + ], + "orbitals": [ + { + "zeta_notation": [S#]s[P#]p[D#]d[F#]f, + "nbands": NUMBER, + "orb_ref": "none" or [S#']s[P#']p[D#']d[F#']f, + "shape": (the same), + }, + ... + ] +} Then rerun the orbital generation. TypeError raised, Quit...""" - """ONCVPSP - ONCVPSP is the format of pseudopotential most seen in norm-conserving pseudopotential, - such as SG15, PD (developed by pwmat team?) and DOJO""" - if "ONCVPSP" in parsed["PP_INFO"]["data"]: - return "ONCVPSP" - if "ONCVPSP" in parsed["PP_HEADER"]["attrib"]["generated"]: - return "ONCVPSP" - - """ADC - ADC is the format of pseudopotential collected in pslibrary, including - pslnc, rrkjus and kjpaw, most collected in QE website the pptable""" - if "ADC" in parsed["PP_INFO"]["data"]: - return "ADC" - if "ADC" in parsed["PP_HEADER"]["attrib"]["generated"]: - return "ADC" - if "ADC" in parsed["PP_HEADER"]["attrib"]["author"]: - return "ADC" - if "Generated using \"atomic\" code by A. Dal Corso" in parsed["PP_INFO"]["data"]: - return "ADC" - if "Generated using \"atomic\" code by A. Dal Corso" in parsed["PP_HEADER"]["attrib"]["generated"]: - return "ADC" - if "Generated using \"atomic\" code by A. Dal Corso" in parsed["PP_HEADER"]["attrib"]["author"]: - return "ADC" - - """GTH - this is the kind developed by CP2K developers, Goedecker, Hartwigsen, Hutter and Teter - et al. However, this kind of pseudopotential has non-diagonal element in DIJ matrices, - which is not supported by ABACUS yet.""" - if "Goedecker/Hartwigsen/Hutter/Teter" in parsed["PP_HEADER"]["attrib"]["author"]: - return "GTH" - raise NotImplementedError("GTH pseudopotential is not supported by ABACUS yet because of non-diagonal DIJ matrices") - - """GBRV - It is one of the most efficient pseudopotential presently, ABACUS pw supports this kind - of pseudopotential, ABACUS lcao not yet. - """ - if "Generated using Vanderbilt code" in parsed["PP_INFO"]["data"]: - return "GBRV" - - """ATOMPAW - atompaw looks like ADC but not quite the same in occupation information - Comparatively the uni_marburg is actually more similar to ADC""" - if "ATOMPAW" in parsed["PP_INFO"]["data"]: - return "ATOMPAW" - if "ATOMPAW" in parsed["PP_HEADER"]["attrib"]["generated"]: - return "ATOMPAW" - - """vwr - vwr is a old-fashioned pseudopotential format, there are codes can convert the vwr to upf format. Then this branch - will be called to parse the upf format pseudopotential""" - if "vwr" in parsed["PP_INFO"]["data"] or "VWR" in parsed["PP_INFO"]["data"]: - return "VWR" - - print(error_msg, flush = True) - raise TypeError("ERROR: please see error message above.") + try: + """ONCVPSP + ONCVPSP is the format of pseudopotential most seen in norm-conserving pseudopotential, + such as SG15, PD (developed by pwmat team?) and DOJO""" + if "ONCVPSP" in parsed["PP_INFO"]["data"]: + return "ONCVPSP" + if "ONCVPSP" in parsed["PP_HEADER"]["attrib"]["generated"]: + return "ONCVPSP" + + """ADC + ADC is the format of pseudopotential collected in pslibrary, including + pslnc, rrkjus and kjpaw, most collected in QE website the pptable""" + if "ADC" in parsed["PP_INFO"]["data"]: + return "ADC" + if "ADC" in parsed["PP_HEADER"]["attrib"]["generated"]: + return "ADC" + if "ADC" in parsed["PP_HEADER"]["attrib"]["author"]: + return "ADC" + if "Generated using \"atomic\" code by A. Dal Corso" in parsed["PP_INFO"]["data"]: + return "ADC" + if "Generated using \"atomic\" code by A. Dal Corso" in parsed["PP_HEADER"]["attrib"]["generated"]: + return "ADC" + if "Generated using \"atomic\" code by A. Dal Corso" in parsed["PP_HEADER"]["attrib"]["author"]: + return "ADC" + + """GTH + this is the kind developed by CP2K developers, Goedecker, Hartwigsen, Hutter and Teter + et al. However, this kind of pseudopotential has non-diagonal element in DIJ matrices, + which is not supported by ABACUS yet.""" + if "Goedecker/Hartwigsen/Hutter/Teter" in parsed["PP_HEADER"]["attrib"]["author"]: + return "GTH" + raise NotImplementedError("GTH pseudopotential is not supported by ABACUS yet because of non-diagonal DIJ matrices") + + """GBRV + It is one of the most efficient pseudopotential presently, ABACUS pw supports this kind + of pseudopotential, ABACUS lcao not yet. + """ + if "Generated using Vanderbilt code" in parsed["PP_INFO"]["data"]: + return "GBRV" + + """ATOMPAW + atompaw looks like ADC but not quite the same in occupation information + Comparatively the uni_marburg is actually more similar to ADC""" + if "ATOMPAW" in parsed["PP_INFO"]["data"]: + return "ATOMPAW" + if "ATOMPAW" in parsed["PP_HEADER"]["attrib"]["generated"]: + return "ATOMPAW" + + """vwr + vwr is a old-fashioned pseudopotential format, there are codes can convert the vwr to upf format. Then this branch + will be called to parse the upf format pseudopotential""" + if "vwr" in parsed["PP_INFO"]["data"] or "VWR" in parsed["PP_INFO"]["data"]: + return "VWR" + + except KeyError: + print(error_msg, flush = True) + raise TypeError("ERROR: please see error message above.") def val_conf(parsed: dict): """extract valence electron configuration from pseudopotential file