diff --git a/SIAB/io/pseudopotential/kernel.py b/SIAB/io/pseudopotential/kernel.py
index 6ecbe417..b228720b 100644
--- a/SIAB/io/pseudopotential/kernel.py
+++ b/SIAB/io/pseudopotential/kernel.py
@@ -4,26 +4,52 @@ def iter_tree(root: ET.Element):
return {child.tag: {"attrib": child.attrib, "data": child.text} for child in list(root.iter())}
def preprocess(fname: str):
- """ADC pseudopotential has & symbol at the beginning of line, which is not allowed in xml, replace & with &"""
+ """ Pseudopotential XML file preprocess
+
+ preprocess is relatively hard-coded. There are some cases the UPF file is not standard xml, therefore
+ some modifications are needed. Cases considered:
+ 1. ADC pseudopotential has a bare `&` symbol at the beginning of a line, which is not allowed in XML; replace `&`
+ with `&amp;`
+ 2. GBRV pseudopotential does not start with the opening `<UPF version=...>` root tag and does not end with `</UPF>`; add
+ the opening tag to the beginning of the file and the closing tag to the end of the file
+ 3. some of ADC pseudopotentials have `CDATA` tag, which is not allowed in xml, remove them
+ """
+ import re, uuid, os
+ ftemp = f"{str(uuid.uuid3(uuid.NAMESPACE_DNS, fname))}.xml"
+ #print(f"Preprocessing {fname}, will write standard XML formatted temporary file to {ftemp}")
+ # because a pseudopotential file would not be large, directly read all lines into memory
with open(fname, "r") as f:
lines = f.readlines()
- """GBRV pseudopotential does not startswith , but ,
- add to the beginning of the file and to the end of the file"""
- if not lines[0].startswith("\n")
- lines.append("")
-
- with open(fname, "w") as f:
- for line in lines:
- """if line starts with &, replace & with &,
- but if already &, do not replace"""
- if line.strip().startswith("&") and not line.strip().startswith("&"):
- line = line.replace("&", "&")
-
+ # every UPF file is expected to start with the opening `<UPF version=...>` root tag; insert it if it is missing
+ if not lines[0].startswith("\n")
+ # scanning backwards from the last line, find the last non-blank line; if it does not contain the closing `</UPF>` tag, append `</UPF>` to the end of the file
+ i = -1
+ while not lines[i].strip() and i > -len(lines):
+ i -= 1
+ if not "" in lines[i]:
+ lines.append("\n")
+ # write the modified lines to the temporary file
+ # several tasks remain before writing:
+ # for ADC pseudopotential, replace `&` with `&amp;`
+ # for PseudoDojo v1.0 pseudopotential, carefully check consistency between opening and closing tags
+ # for ADC pseudopotential or ld1 generated ones, remove both `<![CDATA[` and `]]>` if they exist
+ lines = [line.replace("", "") for line in lines]
+ with open(ftemp, "w") as f:
+ # iterate over the tag-corrected lines from xml_syntax_filter and replace a `&` symbol at the beginning of a line with `&amp;`
+ for _, line in xml_syntax_filter(lines):
+ #for line in lines:
+ _match = re.match(r"^([\s]*)(\&[\w]+)([^;]*)(\s*)$", line)
+ line = f"{_match.group(1)}{_match.group(2).replace('&', '&')}{_match.group(3)}{_match.group(4)}\n"\
+ if _match else line
f.write(line)
-
- if line.strip() == "":
+ if "" in line: # there are some pseudopotential files endswith ppgen file, but will crash the xml parser
break
+ return os.path.abspath(ftemp)
import SIAB.io.pseudopotential.tools.basic as siptb
def postprocess(parsed: dict):
@@ -50,20 +76,22 @@ def postprocess(parsed: dict):
def upf(fname: str):
"""parse the pseudopotential file, return a dictionary"""
+ import os
error_msg = """ERROR: UPF file with non-XML format. Please contact with either developer
of pseudopotential you use or the developer of this package. For the latter choice,
you can submit issue in Github Repository at:
https://github.com/kirk0830/abacus_orbital_generation
, thanks for understanding, raise TypeError and Quit..."""
- preprocess(fname)
+ ftemp = preprocess(fname)
try:
- tree = ET.parse(fname)
+ tree = ET.parse(ftemp)
except ET.ParseError:
print(error_msg, flush=True)
raise TypeError("ERROR: Please read the error message above.") from None
root = tree.getroot()
parsed = iter_tree(root)
postprocess(parsed)
+ os.remove(ftemp)
return parsed
def vwr(fname: str):
@@ -191,6 +219,110 @@ def vwr(fname: str):
}
return out
+def xml_syntax_filter(content: str|list[str]):
+ """Some pseudopotential files have incorrect XML format; this generator checks the consistency of tags
+ and corrects them if possible. It yields two-member tuples: the first element is a boolean indicating whether the nearest tag is closed,
+ the second element is the (possibly corrected) line.
+
+ ```python
+ with open("file.xml", "r") as f:
+ content = f.readlines()
+ for is_closed, line in xml_syntax_filter(content):
+ print(line)
+ ```
+ """
+ import re
+ content = content.split("\n") if isinstance(content, str) else content
+ # for there are tags like "", then it will be a single tag
+ # or
+ buf = ""
+
+ # regular expression example
+ resg = r"<[^>]+/>" # <.../>
+ reop = r"<[^/][^>]+>" # <...>
+ recls = r"[^>]+>" #
+ relicmp = r"<[^>]+" # <...
+ rericmp = r"[^>]+>" # ...>
+ recmt = r"" #
+
+ # the stack to store the names of the tags, when a closing tag is found, it should be the same as the nearest opened tag
+ # otherwise, it is a mismatch error and correction is needed
+
+ # there is an exception caused by the opening comment: 20240506
+ inbuilt_name_stack = []
+ for line in content: # loop over all lines
+ l = line.strip() # copy the value then do the strip operation
+ # XML comments can be written both on a single line and across lines. However,
+ # the multi-line comment is only encountered in pslibrary 1.0.0, and
+ # it is not worth adding more complexity (i.e. more regular
+ # expressions) to handle this case
+ if re.match(recmt, l):
+ yield True, line
+ # single tag case, with the form <.../>, directly yield it, remember
+ # to yield the original line, rather than the stripped one
+ elif re.match(resg, l):
+ yield True, line
+ # opening tag case, with the form <...>. If this form appears, it
+ # is possible that there are plenty of data belonging to this tag.
+ # To save the tag name into the stack, then yield the original line
+ elif re.match(reop, l):
+ name = re.match(r"<([^ >]+)", l).group(1)
+ inbuilt_name_stack.append(name)
+ yield False, line
+ # closing tag case, with the form `</...>`, the end of the possibility
+ # above. If this form appears, then it is the end of the tag, so pop
+ # the tag name from the stack, then yield the original line. However,
+ # for PseudoDojo v1.0 pseudopotential, the closing tag may not match
+ # the opening tag, so it needs to be corrected
+ elif re.match(recls, l):
+ name = re.match(r"([^ >]+)", l).group(1)
+ yield (True, line) if inbuilt_name_stack[-1] == name \
+ else (True, line.replace(name, inbuilt_name_stack[-1]))
+ inbuilt_name_stack.pop()
+ # the case that a "left tag incomplete", means the tag is not closed but like
+ #
+ # in this case, will use buffer to store all content until the tag is closed
+ # then yield-back the complete tag.
+ # however, must be sure the buffer is empty before the next tag
+ # EXCEPTION: read ":
+ assert buf != "", "buffer is empty"
+ buf = " ".join([buf, l])
+ if re.match(resg, buf):
+ yield True, buf + "\n"
+ buf = ""
+ elif re.match(reop, buf):
+ name = re.match(r"<([^ >]+)", buf).group(1)
+ inbuilt_name_stack.append(name)
+ yield False, buf + "\n"
+ buf = ""
+ else:
+ raise ValueError(f"incorrect tag: {buf}")
+ # for not-tag case, might be data, so in this case, do not touch
+ # it. but it can also be within tag, say incomplete tag.
+ else:
+ # if buffer is not empty and the current line does not have any tag
+ # related information, then it would be the situation within the tag
+ # or out of any tags. Fortunately the tag will always start with its
+ # name, or say the symbol `<` always appear to be in the same line
+ # as the tag name, therefore once it is really within the tag, the
+ # buffer will at least has the content of tag name, rather than empty
+ if buf != "":
+ buf = " ".join([buf, l])
+ continue
+ # if buffer is empty, then it is not within any tag, just yield the line
+ else:
+ yield True, line
+
import unittest
class TestPspotKernel(unittest.TestCase):
def test_vwr(self):
diff --git a/SIAB/io/pseudopotential/tools/advanced.py b/SIAB/io/pseudopotential/tools/advanced.py
index ae1a1b05..355440b2 100644
--- a/SIAB/io/pseudopotential/tools/advanced.py
+++ b/SIAB/io/pseudopotential/tools/advanced.py
@@ -3,80 +3,116 @@ def determine_type(parsed: dict):
"""pseudopotentials can be generated by not only one codes, to extract information
from PP_INFO, PP_INPUTFILE, need to know the exact way how information is organized
"""
- error_msg = """ERROR: pseudopotential with not recognized type, this is due to limited support
+ error_msg = """
+##################################################################################################
+! ERROR ERROR ERROR ERROR ERROR ERROR ERROR ERROR ERROR ERROR ERROR ERROR ERROR ERROR ERROR ERROR
+##################################################################################################
+
+Pseudopotential with not recognized type, this is due to only limited support
implemented on orbital auto-generation feature. You can submit issue on Github Repository at:
-https://github.com/kirk0830/abacus_orbital_generation, and for instantly use, you can follow the
+https://github.com/kirk0830/abacus_orbital_generation. For instantly use, you can follow the
following guidelines. Please check manually the pseudopotential file you provided and specify:
1. ALL `nbands` with specific number
2. ALL `nbands_ref` with specific number
-3. ALL `orb_conf` and `orb_ref` with specific configuration like either `[0, 1, 2, 3]` or `0s1p2d3f`
-
-ADDITIONALLY! Please check your input, if you are using `optimizer` = `bfgs`, with
-`spill_guess` as `atomic`, a monomer (single atom) calculation will be run to generate the
-initial guess for orbital optimization. You should explicitly create one section in
-`reference_systems` for monomer calculation (setting `shape` to be `monomer`), and specify the
-`nbands` rationally (can be very high, but may also be dangerous because PW calculation cannot
-calculate isolated atom electronic structure well, artifact brought about by PW will pollute the
-energy level information).
-
+3. ALL `lmaxmax` with specific number
+4. ALL `orb_conf` and `orb_ref` with specific configuration like either `[0, 1, 2, 3]` or `0s1p2d3f`
+5. If you are using `optimizer` = `bfgs`, with `spill_guess` as `atomic`, a monomer (single atom)
+calculation will be run to generate the initial guess for orbital optimization. You should explicitly
+create one section in `reference_systems` for monomer calculation (setting `shape` to be `monomer`),
+and specify the `nbands` rationally (can be very high, but may also be dangerous because PW
+calculation cannot calculate isolated atom electronic structure well, artifact brought about by PW
+will pollute the energy level information).
+
+Make sure in your input the following tags are correctly set:
+{
+ "element": [the element symbol],
+
+ "reference_systems": [
+ {
+ "shape": "monomer",
+ "nbands": NUMBER,
+ "nspin": NUMBER,
+ "lmaxmax": NUMBER
+ },
+ {
+ "shape": "dimer",
+ "nbands": NUMBER,
+ "nspin": NUMBER,
+ "lmaxmax": NUMBER,
+ "bond_lengths": (can be "auto")
+ },
+ ...
+ ],
+ "orbitals": [
+ {
+ "zeta_notation": [S#]s[P#]p[D#]d[F#]f,
+ "nbands": NUMBER,
+ "orb_ref": "none" or [S#']s[P#']p[D#']d[F#']f,
+ "shape": (the same),
+ },
+ ...
+ ]
+}
Then rerun the orbital generation.
TypeError raised, Quit..."""
- """ONCVPSP
- ONCVPSP is the format of pseudopotential most seen in norm-conserving pseudopotential,
- such as SG15, PD (developed by pwmat team?) and DOJO"""
- if "ONCVPSP" in parsed["PP_INFO"]["data"]:
- return "ONCVPSP"
- if "ONCVPSP" in parsed["PP_HEADER"]["attrib"]["generated"]:
- return "ONCVPSP"
-
- """ADC
- ADC is the format of pseudopotential collected in pslibrary, including
- pslnc, rrkjus and kjpaw, most collected in QE website the pptable"""
- if "ADC" in parsed["PP_INFO"]["data"]:
- return "ADC"
- if "ADC" in parsed["PP_HEADER"]["attrib"]["generated"]:
- return "ADC"
- if "ADC" in parsed["PP_HEADER"]["attrib"]["author"]:
- return "ADC"
- if "Generated using \"atomic\" code by A. Dal Corso" in parsed["PP_INFO"]["data"]:
- return "ADC"
- if "Generated using \"atomic\" code by A. Dal Corso" in parsed["PP_HEADER"]["attrib"]["generated"]:
- return "ADC"
- if "Generated using \"atomic\" code by A. Dal Corso" in parsed["PP_HEADER"]["attrib"]["author"]:
- return "ADC"
-
- """GTH
- this is the kind developed by CP2K developers, Goedecker, Hartwigsen, Hutter and Teter
- et al. However, this kind of pseudopotential has non-diagonal element in DIJ matrices,
- which is not supported by ABACUS yet."""
- if "Goedecker/Hartwigsen/Hutter/Teter" in parsed["PP_HEADER"]["attrib"]["author"]:
- return "GTH"
- raise NotImplementedError("GTH pseudopotential is not supported by ABACUS yet because of non-diagonal DIJ matrices")
-
- """GBRV
- It is one of the most efficient pseudopotential presently, ABACUS pw supports this kind
- of pseudopotential, ABACUS lcao not yet.
- """
- if "Generated using Vanderbilt code" in parsed["PP_INFO"]["data"]:
- return "GBRV"
-
- """ATOMPAW
- atompaw looks like ADC but not quite the same in occupation information
- Comparatively the uni_marburg is actually more similar to ADC"""
- if "ATOMPAW" in parsed["PP_INFO"]["data"]:
- return "ATOMPAW"
- if "ATOMPAW" in parsed["PP_HEADER"]["attrib"]["generated"]:
- return "ATOMPAW"
-
- """vwr
- vwr is a old-fashioned pseudopotential format, there are codes can convert the vwr to upf format. Then this branch
- will be called to parse the upf format pseudopotential"""
- if "vwr" in parsed["PP_INFO"]["data"] or "VWR" in parsed["PP_INFO"]["data"]:
- return "VWR"
-
- print(error_msg, flush = True)
- raise TypeError("ERROR: please see error message above.")
+ try:
+ """ONCVPSP
+ ONCVPSP is the format of pseudopotential most seen in norm-conserving pseudopotential,
+ such as SG15, PD (developed by pwmat team?) and DOJO"""
+ if "ONCVPSP" in parsed["PP_INFO"]["data"]:
+ return "ONCVPSP"
+ if "ONCVPSP" in parsed["PP_HEADER"]["attrib"]["generated"]:
+ return "ONCVPSP"
+
+ """ADC
+ ADC is the format of pseudopotential collected in pslibrary, including
+ pslnc, rrkjus and kjpaw, most collected in QE website the pptable"""
+ if "ADC" in parsed["PP_INFO"]["data"]:
+ return "ADC"
+ if "ADC" in parsed["PP_HEADER"]["attrib"]["generated"]:
+ return "ADC"
+ if "ADC" in parsed["PP_HEADER"]["attrib"]["author"]:
+ return "ADC"
+ if "Generated using \"atomic\" code by A. Dal Corso" in parsed["PP_INFO"]["data"]:
+ return "ADC"
+ if "Generated using \"atomic\" code by A. Dal Corso" in parsed["PP_HEADER"]["attrib"]["generated"]:
+ return "ADC"
+ if "Generated using \"atomic\" code by A. Dal Corso" in parsed["PP_HEADER"]["attrib"]["author"]:
+ return "ADC"
+
+ """GTH
+ this is the kind developed by CP2K developers, Goedecker, Hartwigsen, Hutter and Teter
+ et al. However, this kind of pseudopotential has non-diagonal element in DIJ matrices,
+ which is not supported by ABACUS yet."""
+ if "Goedecker/Hartwigsen/Hutter/Teter" in parsed["PP_HEADER"]["attrib"]["author"]:
+ return "GTH"
+ raise NotImplementedError("GTH pseudopotential is not supported by ABACUS yet because of non-diagonal DIJ matrices")
+
+ """GBRV
+ It is one of the most efficient pseudopotential presently, ABACUS pw supports this kind
+ of pseudopotential, ABACUS lcao not yet.
+ """
+ if "Generated using Vanderbilt code" in parsed["PP_INFO"]["data"]:
+ return "GBRV"
+
+ """ATOMPAW
+ atompaw looks like ADC but not quite the same in occupation information
+ Comparatively the uni_marburg is actually more similar to ADC"""
+ if "ATOMPAW" in parsed["PP_INFO"]["data"]:
+ return "ATOMPAW"
+ if "ATOMPAW" in parsed["PP_HEADER"]["attrib"]["generated"]:
+ return "ATOMPAW"
+
+ """vwr
+ vwr is a old-fashioned pseudopotential format, there are codes can convert the vwr to upf format. Then this branch
+ will be called to parse the upf format pseudopotential"""
+ if "vwr" in parsed["PP_INFO"]["data"] or "VWR" in parsed["PP_INFO"]["data"]:
+ return "VWR"
+
+ except KeyError:
+ print(error_msg, flush = True)
+ raise TypeError("ERROR: please see error message above.")
def val_conf(parsed: dict):
"""extract valence electron configuration from pseudopotential file