diff --git a/SConstruct b/SConstruct index 8ff9d4f41eb..b5a6ce231bf 100644 --- a/SConstruct +++ b/SConstruct @@ -962,7 +962,8 @@ def blender_libs(env): 'blender_LOD', 'blender_BSP', 'blender_blenkernel', - 'blender_IK']) + 'blender_IK', + 'blender_ONL']) def ketsji_libs(env): """ diff --git a/intern/Makefile b/intern/Makefile index 08ab03150fe..af64e44cdf4 100644 --- a/intern/Makefile +++ b/intern/Makefile @@ -35,7 +35,7 @@ SOURCEDIR = intern # include nan_subdirs.mk ALLDIRS = string ghost guardedalloc bmfont moto container memutil -ALLDIRS += decimation iksolver bsp SoundSystem +ALLDIRS += decimation iksolver bsp SoundSystem opennl all:: @for i in $(ALLDIRS); do \ diff --git a/intern/SConscript b/intern/SConscript index 833a0316634..afbcd24b8be 100644 --- a/intern/SConscript +++ b/intern/SConscript @@ -8,7 +8,8 @@ SConscript(['SoundSystem/SConscript', 'container/SConscript', 'memutil/SConscript/', 'decimation/SConscript', - 'iksolver/SConscript']) + 'iksolver/SConscript', + 'opennl/SConscript']) NEW_CSG='false' diff --git a/intern/opennl/Makefile b/intern/opennl/Makefile new file mode 100644 index 00000000000..8aa0d4f590b --- /dev/null +++ b/intern/opennl/Makefile @@ -0,0 +1,67 @@ +# +# $Id$ +# +# ***** BEGIN GPL/BL DUAL LICENSE BLOCK ***** +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. The Blender +# Foundation also sells licenses for use in proprietary software under +# the Blender License. See http://www.blender.org/BL/ for information +# about this. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software Foundation, +# Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +# +# The Original Code is Copyright (C) 2001-2002 by NaN Holding BV. +# All rights reserved. +# +# The Original Code is: all of this file. +# +# Contributor(s): Hans Lambermont +# +# ***** END GPL/BL DUAL LICENSE BLOCK ***** +# opennl main makefile. +# + +include nan_definitions.mk + +LIBNAME = opennl +LIBNAME_SLU = superlu +SOURCEDIR = intern/$(LIBNAME) +SOURCEDIR_SLU = intern/$(LIBNAME_SLU) +DIR = $(OCGDIR)/$(SOURCEDIR) +DIR_SLU = $(OCGDIR)/$(SOURCEDIR_SLU) +DIRS = intern superlu + +include nan_subdirs.mk + +install: all debug + @[ -d $(NAN_OPENNL) ] || mkdir $(NAN_OPENNL) + @[ -d $(NAN_OPENNL)/include ] || mkdir $(NAN_OPENNL)/include + @[ -d $(NAN_OPENNL)/lib ] || mkdir $(NAN_OPENNL)/lib + @[ -d $(NAN_OPENNL)/lib/debug ] || mkdir $(NAN_OPENNL)/lib/debug + @../tools/cpifdiff.sh $(DIR)/libopennl.a $(NAN_OPENNL)/lib/ + @../tools/cpifdiff.sh $(DIR)/debug/libopennl.a $(NAN_OPENNL)/lib/debug/ +ifeq ($(OS),darwin) + ranlib $(NAN_OPENNL)/lib/libopennl.a + ranlib $(NAN_OPENNL)/lib/debug/libopennl.a +endif + @../tools/cpifdiff.sh extern/*.h $(NAN_OPENNL)/include/ + @[ -d $(NAN_SUPERLU) ] || mkdir $(NAN_SUPERLU) + @[ -d $(NAN_SUPERLU)/lib ] || mkdir $(NAN_SUPERLU)/lib + @[ -d $(NAN_SUPERLU)/lib/debug ] || mkdir $(NAN_SUPERLU)/lib/debug + @../tools/cpifdiff.sh $(DIR_SLU)/libsuperlu.a $(NAN_SUPERLU)/lib/ + @../tools/cpifdiff.sh $(DIR_SLU)/debug/libsuperlu.a $(NAN_SUPERLU)/lib/debug/ +ifeq ($(OS),darwin) + ranlib $(NAN_SUPERLU)/lib/libsuperlu.a + ranlib $(NAN_SUPERLU)/lib/debug/libsuperlu.a +endif + diff --git a/intern/opennl/SConscript b/intern/opennl/SConscript new file mode 100644 index 00000000000..4e0260c7f33 --- /dev/null +++ b/intern/opennl/SConscript @@ -0,0 +1,43 @@ +Import ('user_options_dict') +Import ('library_env') + +opennl_env = library_env.Copy () + +source_files = ['intern/opennl.c', + 'superlu/colamd.c', + 'superlu/get_perm_c.c', + 'superlu/heap_relax_snode.c', + 'superlu/lsame.c', + 'superlu/memory.c', + 'superlu/mmd.c', + 'superlu/relax_snode.c', + 'superlu/scolumn_bmod.c', + 'superlu/scolumn_dfs.c', + 'superlu/scopy_to_ucol.c', + 'superlu/sgssv.c', + 'superlu/sgstrf.c', + 'superlu/sgstrs.c', + 'superlu/smemory.c', + 'superlu/smyblas2.c', + 'superlu/sp_coletree.c', + 'superlu/sp_ienv.c', + 'superlu/sp_preorder.c', + 'superlu/spanel_bmod.c', + 'superlu/spanel_dfs.c', + 'superlu/spivotL.c', + 'superlu/spruneL.c', + 'superlu/ssnode_bmod.c', + 'superlu/ssnode_dfs.c', + 'superlu/ssp_blas2.c', + 'superlu/ssp_blas3.c', + 'superlu/strsv.c', + 'superlu/superlu_timer.c', + 'superlu/sutil.c', + 'superlu/util.c', + 'superlu/xerbla.c'] + +opennl_env.Append (CPPPATH = ['extern', + 'superlu']) + +opennl_env.Library (target='#'+user_options_dict['BUILD_DIR']+'/lib/blender_ONL', source=source_files) + diff --git a/intern/opennl/doc/OpenNL_License.txt b/intern/opennl/doc/OpenNL_License.txt new file mode 100644 index 00000000000..4e8d97fd526 --- /dev/null +++ b/intern/opennl/doc/OpenNL_License.txt @@ -0,0 +1,341 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. + 675 Mass Ave, Cambridge, MA 02139, USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + + END OF TERMS AND CONDITIONS + + Appendix: How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) 19yy + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) 19yy name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + , 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Library General +Public License instead of this License. + diff --git a/intern/opennl/doc/OpenNL_Readme.txt b/intern/opennl/doc/OpenNL_Readme.txt new file mode 100644 index 00000000000..e6aea3c0286 --- /dev/null +++ b/intern/opennl/doc/OpenNL_Readme.txt @@ -0,0 +1,13 @@ + +This is OpenNL, a library to easily construct and solve sparse linear systems. +* OpenNL is supplied with a set of iterative solvers (Conjugate gradient, + BICGSTAB, GMRes) and preconditioners (Jacobi, SSOR). +* OpenNL can also use other solvers (SuperLU 3.0 supported as an OpenNL + extension) + +Note that to be compatible with OpenNL, SuperLU 3.0 needs to be compiled with +the following flag (see make.inc in SuperLU3.0): +CDEFS = -DAdd_ (the default is -DAdd__, just remove the second underscore) + +OpenNL was modified for Blender to be used only as a wrapper for SuperLU. + diff --git a/intern/opennl/doc/SuperLU_License.txt b/intern/opennl/doc/SuperLU_License.txt new file mode 100644 index 00000000000..f31a01782e2 --- /dev/null +++ b/intern/opennl/doc/SuperLU_License.txt @@ -0,0 +1,31 @@ +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, +are permitted provided that the following conditions are met: + +(1) Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +(2) Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. +(3) Neither the name of Lawrence Berkeley National Laboratory, U.S. Dept. of +Energy nor the names of its contributors may be used to endorse or promote +products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + diff --git a/intern/opennl/doc/SuperLU_Readme.txt b/intern/opennl/doc/SuperLU_Readme.txt new file mode 100644 index 00000000000..c1cedd09893 --- /dev/null +++ b/intern/opennl/doc/SuperLU_Readme.txt @@ -0,0 +1,52 @@ + SuperLU (Version 3.0) + ===================== + +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +(1) Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +(2) Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. +(3) Neither the name of Lawrence Berkeley National Laboratory, U.S. Dept. of +Energy nor the names of its contributors may be used to endorse or promote +products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +SuperLU contains a set of subroutines to solve a sparse linear system +A*X=B. It uses Gaussian elimination with partial pivoting (GEPP). +The columns of A may be preordered before factorization; the +preordering for sparsity is completely separate from the factorization. + +SuperLU is implemented in ANSI C, and must be compiled with standard +ANSI C compilers. It provides functionality for both real and complex +matrices, in both single and double precision. The file names for the +single-precision real version start with letter "s" (such as sgstrf.c); +the file names for the double-precision real version start with letter "d" +(such as dgstrf.c); the file names for the single-precision complex +version start with letter "c" (such as cgstrf.c); the file names +for the double-precision complex version start with letter "z" +(such as zgstrf.c). + +SuperLU was modified for Blender to only include single-precision +functionality. + diff --git a/intern/opennl/extern/ONL_opennl.h b/intern/opennl/extern/ONL_opennl.h new file mode 100644 index 00000000000..5e4bd24313c --- /dev/null +++ b/intern/opennl/extern/ONL_opennl.h @@ -0,0 +1,163 @@ +/* + * $Id$ + * + * OpenNL: Numerical Library + * Copyright (C) 2004 Bruno Levy + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * If you modify this software, you should include a notice giving the + * name of the person performing the modification, the date of modification, + * and the reason for such modification. + * + * Contact: Bruno Levy + * + * levy@loria.fr + * + * ISA Project + * LORIA, INRIA Lorraine, + * Campus Scientifique, BP 239 + * 54506 VANDOEUVRE LES NANCY CEDEX + * FRANCE + * + * Note that the GNU General Public License does not permit incorporating + * the Software into proprietary programs. + */ + +/* +#define NL_DEBUG +#define NL_PARANOID +*/ + +#define NL_USE_SUPERLU + +#ifndef nlOPENNL_H +#define nlOPENNL_H + +#ifdef __cplusplus +extern "C" { +#endif + +#define NL_VERSION_0_0 1 + +/* + * + * Datatypes + * + */ + +typedef unsigned int NLenum; +typedef unsigned char NLboolean; +typedef unsigned int NLbitfield; +typedef void NLvoid; +typedef signed char NLbyte; /* 1-byte signed */ +typedef short NLshort; /* 2-byte signed */ +typedef int NLint; /* 4-byte signed */ +typedef unsigned char NLubyte; /* 1-byte unsigned */ +typedef unsigned short NLushort; /* 2-byte unsigned */ +typedef unsigned int NLuint; /* 4-byte unsigned */ +typedef int NLsizei; /* 4-byte signed */ +typedef float NLfloat; /* single precision float */ +typedef double NLdouble; /* double precision float */ + +typedef void* NLContext ; + +/* + * + * Constants + * + */ + +#define NL_FALSE 0x0 +#define NL_TRUE 0x1 + +/* Primitives */ + +#define NL_SYSTEM 0x0 +#define NL_MATRIX 0x1 +#define NL_ROW 0x2 + +/* Solver Parameters */ + +#define NL_SOLVER 0x100 +#define NL_NB_VARIABLES 0x101 +#define NL_LEAST_SQUARES 0x102 +#define NL_SYMMETRIC 0x106 +#define NL_ERROR 0x108 + +/* Enable / Disable */ + +#define NL_NORMALIZE_ROWS 0x400 + +/* Row parameters */ + +#define NL_RIGHT_HAND_SIDE 0x500 +#define NL_ROW_SCALING 0x501 + +/* + * Contexts + */ + NLContext nlNewContext() ; + void nlDeleteContext(NLContext context) ; + void nlMakeCurrent(NLContext context) ; + NLContext nlGetCurrent() ; + +/* + * State set/get + */ + + void nlSolverParameterf(NLenum pname, NLfloat param) ; + void nlSolverParameteri(NLenum pname, NLint param) ; + + void nlRowParameterf(NLenum pname, NLfloat param) ; + void nlRowParameteri(NLenum pname, NLint param) ; + + void nlGetBooleanv(NLenum pname, NLboolean* params) ; + void nlGetFloatv(NLenum pname, NLfloat* params) ; + void nlGetIntergerv(NLenum pname, NLint* params) ; + + void nlEnable(NLenum pname) ; + void nlDisable(NLenum pname) ; + NLboolean nlIsEnabled(NLenum pname) ; + +/* + * Variables + */ + void nlSetVariable(NLuint index, NLfloat value) ; + NLfloat nlGetVariable(NLuint index) ; + void nlLockVariable(NLuint index) ; + void nlUnlockVariable(NLuint index) ; + NLboolean nlVariableIsLocked(NLuint index) ; + +/* + * Begin/End + */ + + void nlBegin(NLenum primitive) ; + void nlEnd(NLenum primitive) ; + void nlCoefficient(NLuint index, NLfloat value) ; + +/* + * Solve + */ + + NLboolean nlSolve() ; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/intern/opennl/intern/Makefile b/intern/opennl/intern/Makefile new file mode 100644 index 00000000000..2e57905d931 --- /dev/null +++ b/intern/opennl/intern/Makefile @@ -0,0 +1,43 @@ +# +# $Id$ +# +# ***** BEGIN GPL/BL DUAL LICENSE BLOCK ***** +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. The Blender +# Foundation also sells licenses for use in proprietary software under +# the Blender License. See http://www.blender.org/BL/ for information +# about this. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software Foundation, +# Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +# +# The Original Code is Copyright (C) 2001-2002 by NaN Holding BV. +# All rights reserved. +# +# The Original Code is: all of this file. +# +# Contributor(s): none yet. +# +# ***** END GPL/BL DUAL LICENSE BLOCK ***** +# opennl intern Makefile +# + +LIBNAME = opennl +DIR = $(OCGDIR)/intern/$(LIBNAME) + +include nan_compile.mk + +CCFLAGS += $(NAN_LEVEL_2_CPP_WARNINGS) + +CPPFLAGS += -I../superlu -I../extern + + diff --git a/intern/opennl/intern/opennl.c b/intern/opennl/intern/opennl.c new file mode 100644 index 00000000000..be797223f51 --- /dev/null +++ b/intern/opennl/intern/opennl.c @@ -0,0 +1,1151 @@ +/* + * $Id$ + * + * OpenNL: Numerical Library + * Copyright (C) 2004 Bruno Levy + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * If you modify this software, you should include a notice giving the + * name of the person performing the modification, the date of modification, + * and the reason for such modification. + * + * Contact: Bruno Levy + * + * levy@loria.fr + * + * ISA Project + * LORIA, INRIA Lorraine, + * Campus Scientifique, BP 239 + * 54506 VANDOEUVRE LES NANCY CEDEX + * FRANCE + * + * Note that the GNU General Public License does not permit incorporating + * the Software into proprietary programs. + */ + +#include "ONL_opennl.h" + +#include +#include +#include +#include + +#ifdef NL_PARANOID +#ifndef NL_DEBUG +#define NL_DEBUG +#endif +#endif + +/* SuperLU includes */ +#include +#include + +/************************************************************************************/ +/* Assertions */ + + +static void __nl_assertion_failed(char* cond, char* file, int line) { + fprintf( + stderr, + "OpenNL assertion failed: %s, file:%s, line:%d\n", + cond,file,line + ) ; + abort() ; +} + +static void __nl_range_assertion_failed( + float x, float min_val, float max_val, char* file, int line +) { + fprintf( + stderr, + "OpenNL range assertion failed: %f in [ %f ... %f ], file:%s, line:%d\n", + x, min_val, max_val, file,line + ) ; + abort() ; +} + +static void __nl_should_not_have_reached(char* file, int line) { + fprintf( + stderr, + "OpenNL should not have reached this point: file:%s, line:%d\n", + file,line + ) ; + abort() ; +} + + +#define __nl_assert(x) { \ + if(!(x)) { \ + __nl_assertion_failed(#x,__FILE__, __LINE__) ; \ + } \ +} + +#define __nl_range_assert(x,min_val,max_val) { \ + if(((x) < (min_val)) || ((x) > (max_val))) { \ + __nl_range_assertion_failed(x, min_val, max_val, \ + __FILE__, __LINE__ \ + ) ; \ + } \ +} + +#define __nl_assert_not_reached { \ + __nl_should_not_have_reached(__FILE__, __LINE__) ; \ +} + +#ifdef NL_DEBUG +#define __nl_debug_assert(x) __nl_assert(x) +#define __nl_debug_range_assert(x,min_val,max_val) __nl_range_assert(x,min_val,max_val) +#else +#define __nl_debug_assert(x) +#define __nl_debug_range_assert(x,min_val,max_val) +#endif + +#ifdef NL_PARANOID +#define __nl_parano_assert(x) __nl_assert(x) +#define __nl_parano_range_assert(x,min_val,max_val) __nl_range_assert(x,min_val,max_val) +#else +#define __nl_parano_assert(x) +#define __nl_parano_range_assert(x,min_val,max_val) +#endif + +/************************************************************************************/ +/* classic macros */ + +#ifndef MIN +#define MIN(x,y) (((x) < (y)) ? (x) : (y)) +#endif + +#ifndef MAX +#define MAX(x,y) (((x) > (y)) ? (x) : (y)) +#endif + +/************************************************************************************/ +/* memory management */ + +#define __NL_NEW(T) (T*)(calloc(1, sizeof(T))) +#define __NL_NEW_ARRAY(T,NB) (T*)(calloc((NB),sizeof(T))) +#define __NL_RENEW_ARRAY(T,x,NB) (T*)(realloc(x,(NB)*sizeof(T))) +#define __NL_DELETE(x) free(x); x = NULL +#define __NL_DELETE_ARRAY(x) free(x); x = NULL + +#define __NL_CLEAR(T, x) memset(x, 0, sizeof(T)) +#define __NL_CLEAR_ARRAY(T,x,NB) memset(x, 0, (NB)*sizeof(T)) + +/************************************************************************************/ +/* Dynamic arrays for sparse row/columns */ + +typedef struct { + NLuint index ; + NLfloat value ; +} __NLCoeff ; + +typedef struct { + NLuint size ; + NLuint capacity ; + __NLCoeff* coeff ; +} __NLRowColumn ; + +static void __nlRowColumnConstruct(__NLRowColumn* c) { + c->size = 0 ; + c->capacity = 0 ; + c->coeff = NULL ; +} + +static void __nlRowColumnDestroy(__NLRowColumn* c) { + __NL_DELETE_ARRAY(c->coeff) ; +#ifdef NL_PARANOID + __NL_CLEAR(__NLRowColumn, c) ; +#endif +} + +static void __nlRowColumnGrow(__NLRowColumn* c) { + if(c->capacity != 0) { + c->capacity = 2 * c->capacity ; + c->coeff = __NL_RENEW_ARRAY(__NLCoeff, c->coeff, c->capacity) ; + } else { + c->capacity = 4 ; + c->coeff = __NL_NEW_ARRAY(__NLCoeff, c->capacity) ; + } +} + +static void __nlRowColumnAdd(__NLRowColumn* c, NLint index, NLfloat value) { + NLuint i ; + for(i=0; isize; i++) { + if(c->coeff[i].index == (NLuint)index) { + c->coeff[i].value += value ; + return ; + } + } + if(c->size == c->capacity) { + __nlRowColumnGrow(c) ; + } + c->coeff[c->size].index = index ; + c->coeff[c->size].value = value ; + c->size++ ; +} + +/* Does not check whether the index already exists */ +static void __nlRowColumnAppend(__NLRowColumn* c, NLint index, NLfloat value) { + if(c->size == c->capacity) { + __nlRowColumnGrow(c) ; + } + c->coeff[c->size].index = index ; + c->coeff[c->size].value = value ; + c->size++ ; +} + +static void __nlRowColumnZero(__NLRowColumn* c) { + c->size = 0 ; +} + +static void __nlRowColumnClear(__NLRowColumn* c) { + c->size = 0 ; + c->capacity = 0 ; + __NL_DELETE_ARRAY(c->coeff) ; +} + +/************************************************************************************/ +/* SparseMatrix data structure */ + +#define __NL_ROWS 1 +#define __NL_COLUMNS 2 +#define __NL_SYMMETRIC 4 + +typedef struct { + NLuint m ; + NLuint n ; + NLuint diag_size ; + NLenum storage ; + __NLRowColumn* row ; + __NLRowColumn* column ; + NLfloat* diag ; +} __NLSparseMatrix ; + + +static void __nlSparseMatrixConstruct( + __NLSparseMatrix* M, NLuint m, NLuint n, NLenum storage +) { + NLuint i ; + M->m = m ; + M->n = n ; + M->storage = storage ; + if(storage & __NL_ROWS) { + M->row = __NL_NEW_ARRAY(__NLRowColumn, m) ; + for(i=0; irow[i])) ; + } + } else { + M->row = NULL ; + } + + if(storage & __NL_COLUMNS) { + M->column = __NL_NEW_ARRAY(__NLRowColumn, n) ; + for(i=0; icolumn[i])) ; + } + } else { + M->column = NULL ; + } + + M->diag_size = MIN(m,n) ; + M->diag = __NL_NEW_ARRAY(NLfloat, M->diag_size) ; +} + +static void __nlSparseMatrixDestroy(__NLSparseMatrix* M) { + NLuint i ; + __NL_DELETE_ARRAY(M->diag) ; + if(M->storage & __NL_ROWS) { + for(i=0; im; i++) { + __nlRowColumnDestroy(&(M->row[i])) ; + } + __NL_DELETE_ARRAY(M->row) ; + } + if(M->storage & __NL_COLUMNS) { + for(i=0; in; i++) { + __nlRowColumnDestroy(&(M->column[i])) ; + } + __NL_DELETE_ARRAY(M->column) ; + } +#ifdef NL_PARANOID + __NL_CLEAR(__NLSparseMatrix,M) ; +#endif +} + +static void __nlSparseMatrixAdd( + __NLSparseMatrix* M, NLuint i, NLuint j, NLfloat value +) { + __nl_parano_range_assert(i, 0, M->m - 1) ; + __nl_parano_range_assert(j, 0, M->n - 1) ; + if((M->storage & __NL_SYMMETRIC) && (j > i)) { + return ; + } + if(i == j) { + M->diag[i] += value ; + } + if(M->storage & __NL_ROWS) { + __nlRowColumnAdd(&(M->row[i]), j, value) ; + } + if(M->storage & __NL_COLUMNS) { + __nlRowColumnAdd(&(M->column[j]), i, value) ; + } +} + +static void __nlSparseMatrixClear( __NLSparseMatrix* M) { + NLuint i ; + if(M->storage & __NL_ROWS) { + for(i=0; im; i++) { + __nlRowColumnClear(&(M->row[i])) ; + } + } + if(M->storage & __NL_COLUMNS) { + for(i=0; in; i++) { + __nlRowColumnClear(&(M->column[i])) ; + } + } + __NL_CLEAR_ARRAY(NLfloat, M->diag, M->diag_size) ; +} + +/* Returns the number of non-zero coefficients */ +static NLuint __nlSparseMatrixNNZ( __NLSparseMatrix* M) { + NLuint nnz = 0 ; + NLuint i ; + if(M->storage & __NL_ROWS) { + for(i = 0; im; i++) { + nnz += M->row[i].size ; + } + } else if (M->storage & __NL_COLUMNS) { + for(i = 0; in; i++) { + nnz += M->column[i].size ; + } + } else { + __nl_assert_not_reached ; + } + return nnz ; +} + +/************************************************************************************/ +/* SparseMatrix x Vector routines, internal helper routines */ + +static void __nlSparseMatrix_mult_rows_symmetric( + __NLSparseMatrix* A, NLfloat* x, NLfloat* y +) { + NLuint m = A->m ; + NLuint i,ij ; + __NLRowColumn* Ri = NULL ; + __NLCoeff* c = NULL ; + for(i=0; irow[i]) ; + for(ij=0; ijsize; ij++) { + c = &(Ri->coeff[ij]) ; + y[i] += c->value * x[c->index] ; + if(i != c->index) { + y[c->index] += c->value * x[i] ; + } + } + } +} + +static void __nlSparseMatrix_mult_rows( + __NLSparseMatrix* A, NLfloat* x, NLfloat* y +) { + NLuint m = A->m ; + NLuint i,ij ; + __NLRowColumn* Ri = NULL ; + __NLCoeff* c = NULL ; + for(i=0; irow[i]) ; + for(ij=0; ijsize; ij++) { + c = &(Ri->coeff[ij]) ; + y[i] += c->value * x[c->index] ; + } + } +} + +static void __nlSparseMatrix_mult_cols_symmetric( + __NLSparseMatrix* A, NLfloat* x, NLfloat* y +) { + NLuint n = A->n ; + NLuint j,ii ; + __NLRowColumn* Cj = NULL ; + __NLCoeff* c = NULL ; + for(j=0; jcolumn[j]) ; + for(ii=0; iisize; ii++) { + c = &(Cj->coeff[ii]) ; + y[c->index] += c->value * x[j] ; + if(j != c->index) { + y[j] += c->value * x[c->index] ; + } + } + } +} + +static void __nlSparseMatrix_mult_cols( + __NLSparseMatrix* A, NLfloat* x, NLfloat* y +) { + NLuint n = A->n ; + NLuint j,ii ; + __NLRowColumn* Cj = NULL ; + __NLCoeff* c = NULL ; + __NL_CLEAR_ARRAY(NLfloat, y, A->m) ; + for(j=0; jcolumn[j]) ; + for(ii=0; iisize; ii++) { + c = &(Cj->coeff[ii]) ; + y[c->index] += c->value * x[j] ; + } + } +} + +/************************************************************************************/ +/* SparseMatrix x Vector routines, main driver routine */ + +void __nlSparseMatrixMult(__NLSparseMatrix* A, NLfloat* x, NLfloat* y) { + if(A->storage & __NL_ROWS) { + if(A->storage & __NL_SYMMETRIC) { + __nlSparseMatrix_mult_rows_symmetric(A, x, y) ; + } else { + __nlSparseMatrix_mult_rows(A, x, y) ; + } + } else { + if(A->storage & __NL_SYMMETRIC) { + __nlSparseMatrix_mult_cols_symmetric(A, x, y) ; + } else { + __nlSparseMatrix_mult_cols(A, x, y) ; + } + } +} + +/************************************************************************************/ +/* NLContext data structure */ + +typedef void(*__NLMatrixFunc)(float* x, float* y) ; + +typedef struct { + NLfloat value ; + NLboolean locked ; + NLuint index ; +} __NLVariable ; + +#define __NL_STATE_INITIAL 0 +#define __NL_STATE_SYSTEM 1 +#define __NL_STATE_MATRIX 2 +#define __NL_STATE_ROW 3 +#define __NL_STATE_MATRIX_CONSTRUCTED 4 +#define __NL_STATE_SYSTEM_CONSTRUCTED 5 +#define __NL_STATE_SOLVED 6 + +typedef struct { + NLenum state ; + __NLVariable* variable ; + NLuint n ; + __NLSparseMatrix M ; + __NLRowColumn af ; + __NLRowColumn al ; + __NLRowColumn xl ; + NLfloat* x ; + NLfloat* b ; + NLfloat right_hand_side ; + NLfloat row_scaling ; + NLuint nb_variables ; + NLuint current_row ; + NLboolean least_squares ; + NLboolean symmetric ; + NLboolean normalize_rows ; + NLboolean alloc_M ; + NLboolean alloc_af ; + NLboolean alloc_al ; + NLboolean alloc_xl ; + NLboolean alloc_variable ; + NLboolean alloc_x ; + NLboolean alloc_b ; + NLfloat error ; + __NLMatrixFunc matrix_vector_prod ; +} __NLContext ; + +static __NLContext* __nlCurrentContext = NULL ; + +void __nlMatrixVectorProd_default(NLfloat* x, NLfloat* y) { + __nlSparseMatrixMult(&(__nlCurrentContext->M), x, y) ; +} + + +NLContext nlNewContext() { + __NLContext* result = __NL_NEW(__NLContext) ; + result->state = __NL_STATE_INITIAL ; + result->row_scaling = 1.0 ; + result->right_hand_side = 0.0 ; + result->matrix_vector_prod = __nlMatrixVectorProd_default ; + nlMakeCurrent(result) ; + return result ; +} + +void nlDeleteContext(NLContext context_in) { + __NLContext* context = (__NLContext*)(context_in) ; + if(__nlCurrentContext == context) { + __nlCurrentContext = NULL ; + } + if(context->alloc_M) { + __nlSparseMatrixDestroy(&context->M) ; + } + if(context->alloc_af) { + __nlRowColumnDestroy(&context->af) ; + } + if(context->alloc_al) { + __nlRowColumnDestroy(&context->al) ; + } + if(context->alloc_xl) { + __nlRowColumnDestroy(&context->xl) ; + } + if(context->alloc_variable) { + __NL_DELETE_ARRAY(context->variable) ; + } + if(context->alloc_x) { + __NL_DELETE_ARRAY(context->x) ; + } + if(context->alloc_b) { + __NL_DELETE_ARRAY(context->b) ; + } + +#ifdef NL_PARANOID + __NL_CLEAR(__NLContext, context) ; +#endif + __NL_DELETE(context) ; +} + +void nlMakeCurrent(NLContext context) { + __nlCurrentContext = (__NLContext*)(context) ; +} + +NLContext nlGetCurrent() { + return __nlCurrentContext ; +} + +void __nlCheckState(NLenum state) { + __nl_assert(__nlCurrentContext->state == state) ; +} + +void __nlTransition(NLenum from_state, NLenum to_state) { + __nlCheckState(from_state) ; + __nlCurrentContext->state = to_state ; +} + +/************************************************************************************/ +/* Get/Set parameters */ + +void nlSolverParameterf(NLenum pname, NLfloat param) { + __nlCheckState(__NL_STATE_INITIAL) ; + switch(pname) { + case NL_NB_VARIABLES: { + __nl_assert(param > 0) ; + __nlCurrentContext->nb_variables = (NLuint)param ; + } break ; + case NL_LEAST_SQUARES: { + __nlCurrentContext->least_squares = (NLboolean)param ; + } break ; + case NL_SYMMETRIC: { + __nlCurrentContext->symmetric = (NLboolean)param ; + } + default: { + __nl_assert_not_reached ; + } break ; + } +} + +void nlSolverParameteri(NLenum pname, NLint param) { + __nlCheckState(__NL_STATE_INITIAL) ; + switch(pname) { + case NL_NB_VARIABLES: { + __nl_assert(param > 0) ; + __nlCurrentContext->nb_variables = (NLuint)param ; + } break ; + case NL_LEAST_SQUARES: { + __nlCurrentContext->least_squares = (NLboolean)param ; + } break ; + case NL_SYMMETRIC: { + __nlCurrentContext->symmetric = (NLboolean)param ; + } + default: { + __nl_assert_not_reached ; + } break ; + } +} + +void nlRowParameterf(NLenum pname, NLfloat param) { + __nlCheckState(__NL_STATE_MATRIX) ; + switch(pname) { + case NL_RIGHT_HAND_SIDE: { + __nlCurrentContext->right_hand_side = param ; + } break ; + case NL_ROW_SCALING: { + __nlCurrentContext->row_scaling = param ; + } break ; + } +} + +void nlRowParameteri(NLenum pname, NLint param) { + __nlCheckState(__NL_STATE_MATRIX) ; + switch(pname) { + case NL_RIGHT_HAND_SIDE: { + __nlCurrentContext->right_hand_side = (NLfloat)param ; + } break ; + case NL_ROW_SCALING: { + __nlCurrentContext->row_scaling = (NLfloat)param ; + } break ; + } +} + +void nlGetBooleanv(NLenum pname, NLboolean* params) { + switch(pname) { + case NL_LEAST_SQUARES: { + *params = __nlCurrentContext->least_squares ; + } break ; + case NL_SYMMETRIC: { + *params = __nlCurrentContext->symmetric ; + } break ; + default: { + __nl_assert_not_reached ; + } break ; + } +} + +void nlGetFloatv(NLenum pname, NLfloat* params) { + switch(pname) { + case NL_NB_VARIABLES: { + *params = (NLfloat)(__nlCurrentContext->nb_variables) ; + } break ; + case NL_LEAST_SQUARES: { + *params = (NLfloat)(__nlCurrentContext->least_squares) ; + } break ; + case NL_SYMMETRIC: { + *params = (NLfloat)(__nlCurrentContext->symmetric) ; + } break ; + case NL_ERROR: { + *params = (NLfloat)(__nlCurrentContext->error) ; + } break ; + default: { + __nl_assert_not_reached ; + } break ; + } +} + +void nlGetIntergerv(NLenum pname, NLint* params) { + switch(pname) { + case NL_NB_VARIABLES: { + *params = (NLint)(__nlCurrentContext->nb_variables) ; + } break ; + case NL_LEAST_SQUARES: { + *params = (NLint)(__nlCurrentContext->least_squares) ; + } break ; + case NL_SYMMETRIC: { + *params = (NLint)(__nlCurrentContext->symmetric) ; + } break ; + default: { + __nl_assert_not_reached ; + } break ; + } +} + +/************************************************************************************/ +/* Enable / Disable */ + +void nlEnable(NLenum pname) { + switch(pname) { + case NL_NORMALIZE_ROWS: { + __nl_assert(__nlCurrentContext->state != __NL_STATE_ROW) ; + __nlCurrentContext->normalize_rows = NL_TRUE ; + } break ; + default: { + __nl_assert_not_reached ; + } + } +} + +void nlDisable(NLenum pname) { + switch(pname) { + case NL_NORMALIZE_ROWS: { + __nl_assert(__nlCurrentContext->state != __NL_STATE_ROW) ; + __nlCurrentContext->normalize_rows = NL_FALSE ; + } break ; + default: { + __nl_assert_not_reached ; + } + } +} + +NLboolean nlIsEnabled(NLenum pname) { + switch(pname) { + case NL_NORMALIZE_ROWS: { + return __nlCurrentContext->normalize_rows ; + } break ; + default: { + __nl_assert_not_reached ; + } + } + return NL_FALSE ; +} + +/************************************************************************************/ +/* Get/Set Lock/Unlock variables */ + +void nlSetVariable(NLuint index, NLfloat value) { + __nlCheckState(__NL_STATE_SYSTEM) ; + __nl_parano_range_assert(index, 0, __nlCurrentContext->nb_variables - 1) ; + __nlCurrentContext->variable[index].value = value ; +} + +NLfloat nlGetVariable(NLuint index) { + __nl_assert(__nlCurrentContext->state != __NL_STATE_INITIAL) ; + __nl_parano_range_assert(index, 0, __nlCurrentContext->nb_variables - 1) ; + return __nlCurrentContext->variable[index].value ; +} + +void nlLockVariable(NLuint index) { + __nlCheckState(__NL_STATE_SYSTEM) ; + __nl_parano_range_assert(index, 0, __nlCurrentContext->nb_variables - 1) ; + __nlCurrentContext->variable[index].locked = NL_TRUE ; +} + +void nlUnlockVariable(NLuint index) { + __nlCheckState(__NL_STATE_SYSTEM) ; + __nl_parano_range_assert(index, 0, __nlCurrentContext->nb_variables - 1) ; + __nlCurrentContext->variable[index].locked = NL_FALSE ; +} + +NLboolean nlVariableIsLocked(NLuint index) { + __nl_assert(__nlCurrentContext->state != __NL_STATE_INITIAL) ; + __nl_parano_range_assert(index, 0, __nlCurrentContext->nb_variables - 1) ; + return __nlCurrentContext->variable[index].locked ; +} + +/************************************************************************************/ +/* System construction */ + +void __nlVariablesToVector() { + NLuint i ; + __nl_assert(__nlCurrentContext->alloc_x) ; + __nl_assert(__nlCurrentContext->alloc_variable) ; + for(i=0; i<__nlCurrentContext->nb_variables; i++) { + __NLVariable* v = &(__nlCurrentContext->variable[i]) ; + if(!v->locked) { + __nl_assert(v->index < __nlCurrentContext->n) ; + __nlCurrentContext->x[v->index] = v->value ; + } + } +} + +void __nlVectorToVariables() { + NLuint i ; + __nl_assert(__nlCurrentContext->alloc_x) ; + __nl_assert(__nlCurrentContext->alloc_variable) ; + for(i=0; i<__nlCurrentContext->nb_variables; i++) { + __NLVariable* v = &(__nlCurrentContext->variable[i]) ; + if(!v->locked) { + __nl_assert(v->index < __nlCurrentContext->n) ; + v->value = __nlCurrentContext->x[v->index] ; + } + } +} + + +void __nlBeginSystem() { + __nlTransition(__NL_STATE_INITIAL, __NL_STATE_SYSTEM) ; + __nl_assert(__nlCurrentContext->nb_variables > 0) ; + __nlCurrentContext->variable = __NL_NEW_ARRAY( + __NLVariable, __nlCurrentContext->nb_variables + ) ; + __nlCurrentContext->alloc_variable = NL_TRUE ; +} + +void __nlEndSystem() { + __nlTransition(__NL_STATE_MATRIX_CONSTRUCTED, __NL_STATE_SYSTEM_CONSTRUCTED) ; +} + +void __nlBeginMatrix() { + NLuint i ; + NLuint n = 0 ; + NLenum storage = __NL_ROWS ; + + __nlTransition(__NL_STATE_SYSTEM, __NL_STATE_MATRIX) ; + + for(i=0; i<__nlCurrentContext->nb_variables; i++) { + if(!__nlCurrentContext->variable[i].locked) { + __nlCurrentContext->variable[i].index = n ; + n++ ; + } else { + __nlCurrentContext->variable[i].index = ~0 ; + } + } + + __nlCurrentContext->n = n ; + + /* a least squares problem results in a symmetric matrix */ + if(__nlCurrentContext->least_squares) { + __nlCurrentContext->symmetric = NL_TRUE ; + } + + if(__nlCurrentContext->symmetric) { + storage = (storage | __NL_SYMMETRIC) ; + } + + /* SuperLU storage does not support symmetric storage */ + storage = (storage & ~__NL_SYMMETRIC) ; + + __nlSparseMatrixConstruct(&__nlCurrentContext->M, n, n, storage) ; + __nlCurrentContext->alloc_M = NL_TRUE ; + + __nlCurrentContext->x = __NL_NEW_ARRAY(NLfloat, n) ; + __nlCurrentContext->alloc_x = NL_TRUE ; + + __nlCurrentContext->b = __NL_NEW_ARRAY(NLfloat, n) ; + __nlCurrentContext->alloc_b = NL_TRUE ; + + __nlVariablesToVector() ; + + __nlRowColumnConstruct(&__nlCurrentContext->af) ; + __nlCurrentContext->alloc_af = NL_TRUE ; + __nlRowColumnConstruct(&__nlCurrentContext->al) ; + __nlCurrentContext->alloc_al = NL_TRUE ; + __nlRowColumnConstruct(&__nlCurrentContext->xl) ; + __nlCurrentContext->alloc_xl = NL_TRUE ; + + __nlCurrentContext->current_row = 0 ; +} + +void __nlEndMatrix() { + __nlTransition(__NL_STATE_MATRIX, __NL_STATE_MATRIX_CONSTRUCTED) ; + + __nlRowColumnDestroy(&__nlCurrentContext->af) ; + __nlCurrentContext->alloc_af = NL_FALSE ; + __nlRowColumnDestroy(&__nlCurrentContext->al) ; + __nlCurrentContext->alloc_al = NL_FALSE ; + __nlRowColumnDestroy(&__nlCurrentContext->xl) ; + __nlCurrentContext->alloc_al = NL_FALSE ; + + if(!__nlCurrentContext->least_squares) { + __nl_assert( + __nlCurrentContext->current_row == + __nlCurrentContext->n + ) ; + } +} + +void __nlBeginRow() { + __nlTransition(__NL_STATE_MATRIX, __NL_STATE_ROW) ; + __nlRowColumnZero(&__nlCurrentContext->af) ; + __nlRowColumnZero(&__nlCurrentContext->al) ; + __nlRowColumnZero(&__nlCurrentContext->xl) ; +} + +void __nlScaleRow(NLfloat s) { + __NLRowColumn* af = &__nlCurrentContext->af ; + __NLRowColumn* al = &__nlCurrentContext->al ; + NLuint nf = af->size ; + NLuint nl = al->size ; + NLuint i ; + for(i=0; icoeff[i].value *= s ; + } + for(i=0; icoeff[i].value *= s ; + } + __nlCurrentContext->right_hand_side *= s ; +} + +void __nlNormalizeRow(NLfloat weight) { + __NLRowColumn* af = &__nlCurrentContext->af ; + __NLRowColumn* al = &__nlCurrentContext->al ; + NLuint nf = af->size ; + NLuint nl = al->size ; + NLuint i ; + NLfloat norm = 0.0 ; + for(i=0; icoeff[i].value * af->coeff[i].value ; + } + for(i=0; icoeff[i].value * al->coeff[i].value ; + } + norm = sqrt(norm) ; + __nlScaleRow(weight / norm) ; +} + +void __nlEndRow() { + __NLRowColumn* af = &__nlCurrentContext->af ; + __NLRowColumn* al = &__nlCurrentContext->al ; + __NLRowColumn* xl = &__nlCurrentContext->xl ; + __NLSparseMatrix* M = &__nlCurrentContext->M ; + NLfloat* b = __nlCurrentContext->b ; + NLuint nf = af->size ; + NLuint nl = al->size ; + NLuint current_row = __nlCurrentContext->current_row ; + NLuint i ; + NLuint j ; + NLfloat S ; + __nlTransition(__NL_STATE_ROW, __NL_STATE_MATRIX) ; + + if(__nlCurrentContext->normalize_rows) { + __nlNormalizeRow(__nlCurrentContext->row_scaling) ; + } else { + __nlScaleRow(__nlCurrentContext->row_scaling) ; + } + + if(__nlCurrentContext->least_squares) { + for(i=0; icoeff[i].index, af->coeff[j].index, + af->coeff[i].value * af->coeff[j].value + ) ; + } + } + S = -__nlCurrentContext->right_hand_side ; + for(j=0; jcoeff[j].value * xl->coeff[j].value ; + } + for(i=0; icoeff[i].index ] -= af->coeff[i].value * S ; + } + } else { + for(i=0; icoeff[i].index, af->coeff[i].value + ) ; + } + b[current_row] = -__nlCurrentContext->right_hand_side ; + for(i=0; icoeff[i].value * xl->coeff[i].value ; + } + } + __nlCurrentContext->current_row++ ; + __nlCurrentContext->right_hand_side = 0.0 ; + __nlCurrentContext->row_scaling = 1.0 ; +} + +void nlCoefficient(NLuint index, NLfloat value) { + __NLVariable* v; + unsigned int zero= 0; + __nlCheckState(__NL_STATE_ROW) ; + __nl_range_assert(index, zero, __nlCurrentContext->nb_variables - 1) ; + v = &(__nlCurrentContext->variable[index]) ; + if(v->locked) { + __nlRowColumnAppend(&(__nlCurrentContext->al), 0, value) ; + __nlRowColumnAppend(&(__nlCurrentContext->xl), 0, v->value) ; + } else { + __nlRowColumnAppend(&(__nlCurrentContext->af), v->index, value) ; + } +} + +void nlBegin(NLenum prim) { + switch(prim) { + case NL_SYSTEM: { + __nlBeginSystem() ; + } break ; + case NL_MATRIX: { + __nlBeginMatrix() ; + } break ; + case NL_ROW: { + __nlBeginRow() ; + } break ; + default: { + __nl_assert_not_reached ; + } + } +} + +void nlEnd(NLenum prim) { + switch(prim) { + case NL_SYSTEM: { + __nlEndSystem() ; + } break ; + case NL_MATRIX: { + __nlEndMatrix() ; + } break ; + case NL_ROW: { + __nlEndRow() ; + } break ; + default: { + __nl_assert_not_reached ; + } + } +} + +/************************************************************************/ +/* SuperLU wrapper */ + +/* Note: SuperLU is difficult to call, but it is worth it. */ +/* Here is a driver inspired by A. Sheffer's "cow flattener". */ +static NLboolean __nlSolve_SUPERLU( NLboolean do_perm) { + + /* OpenNL Context */ + __NLSparseMatrix* M = &(__nlCurrentContext->M) ; + NLfloat* b = __nlCurrentContext->b ; + NLfloat* x = __nlCurrentContext->x ; + + /* Compressed Row Storage matrix representation */ + NLuint n = __nlCurrentContext->n ; + NLuint nnz = __nlSparseMatrixNNZ(M) ; /* Number of Non-Zero coeffs */ + NLint* xa = __NL_NEW_ARRAY(NLint, n+1) ; + NLfloat* rhs = __NL_NEW_ARRAY(NLfloat, n) ; + NLfloat* a = __NL_NEW_ARRAY(NLfloat, nnz) ; + NLint* asub = __NL_NEW_ARRAY(NLint, nnz) ; + + /* Permutation vector */ + NLint* perm_r = __NL_NEW_ARRAY(NLint, n) ; + NLint* perm = __NL_NEW_ARRAY(NLint, n) ; + + /* SuperLU variables */ + SuperMatrix A, B ; /* System */ + SuperMatrix L, U ; /* Inverse of A */ + NLint info ; /* status code */ + DNformat *vals = NULL ; /* access to result */ + float *rvals = NULL ; /* access to result */ + + /* SuperLU options and stats */ + superlu_options_t options ; + SuperLUStat_t stat ; + + + /* Temporary variables */ + __NLRowColumn* Ri = NULL ; + NLuint i,jj,count ; + + __nl_assert(!(M->storage & __NL_SYMMETRIC)) ; + __nl_assert(M->storage & __NL_ROWS) ; + __nl_assert(M->m == M->n) ; + + + /* + * Step 1: convert matrix M into SuperLU compressed column + * representation. + * ------------------------------------------------------- + */ + + count = 0 ; + for(i=0; irow[i]) ; + xa[i] = count ; + for(jj=0; jjsize; jj++) { + a[count] = Ri->coeff[jj].value ; + asub[count] = Ri->coeff[jj].index ; + count++ ; + } + } + xa[n] = nnz ; + + /* Save memory for SuperLU */ + __nlSparseMatrixClear(M) ; + + + /* + * Rem: symmetric storage does not seem to work with + * SuperLU ... (->deactivated in main SLS::Solver driver) + */ + sCreate_CompCol_Matrix( + &A, n, n, nnz, a, asub, xa, + SLU_NR, /* Row_wise, no supernode */ + SLU_S, /* floats */ + SLU_GE /* general storage */ + ); + + /* Step 2: create vector */ + sCreate_Dense_Matrix( + &B, n, 1, b, n, + SLU_DN, /* Fortran-type column-wise storage */ + SLU_S, /* floats */ + SLU_GE /* general */ + ); + + + /* Step 3: get permutation matrix + * ------------------------------ + * com_perm: 0 -> no re-ordering + * 1 -> re-ordering for A^t.A + * 2 -> re-ordering for A^t+A + * 3 -> approximate minimum degree ordering + */ + get_perm_c(do_perm ? 3 : 0, &A, perm) ; + + /* Step 4: call SuperLU main routine + * --------------------------------- + */ + + set_default_options(&options) ; + options.ColPerm = MY_PERMC ; + StatInit(&stat) ; + + sgssv(&options, &A, perm, perm_r, &L, &U, &B, &stat, &info); + + /* Step 5: get the solution + * ------------------------ + * Fortran-type column-wise storage + */ + vals = (DNformat*)B.Store; + rvals = (float*)(vals->nzval); + if(info == 0) { + for(i = 0; i < n; i++){ + x[i] = rvals[i]; + } + } + + /* Step 6: cleanup + * --------------- + */ + + /* + * For these two ones, only the "store" structure + * needs to be deallocated (the arrays have been allocated + * by us). + */ + Destroy_SuperMatrix_Store(&A) ; + Destroy_SuperMatrix_Store(&B) ; + + + /* + * These ones need to be fully deallocated (they have been + * allocated by SuperLU). + */ + Destroy_SuperNode_Matrix(&L); + Destroy_CompCol_Matrix(&U); + + __NL_DELETE_ARRAY(xa) ; + __NL_DELETE_ARRAY(rhs) ; + __NL_DELETE_ARRAY(a) ; + __NL_DELETE_ARRAY(asub) ; + __NL_DELETE_ARRAY(perm_r) ; + __NL_DELETE_ARRAY(perm) ; + + return (info == 0) ; +} + + +/************************************************************************/ +/* nlSolve() driver routine */ + +NLboolean nlSolve() { + NLboolean result = NL_TRUE ; + + __nlCheckState(__NL_STATE_SYSTEM_CONSTRUCTED) ; + result = __nlSolve_SUPERLU(NL_TRUE) ; + + __nlVectorToVariables() ; + __nlTransition(__NL_STATE_SYSTEM_CONSTRUCTED, __NL_STATE_SOLVED) ; + + return result ; +} + diff --git a/intern/opennl/superlu/Cnames.h b/intern/opennl/superlu/Cnames.h new file mode 100644 index 00000000000..35ff7b0b665 --- /dev/null +++ b/intern/opennl/superlu/Cnames.h @@ -0,0 +1,281 @@ +/* + * -- SuperLU routine (version 2.0) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * November 1, 1997 + * + */ +#ifndef __SUPERLU_CNAMES /* allow multiple inclusions */ +#define __SUPERLU_CNAMES + +/* We want this flag, safer than putting in build system */ +#define Add_ + +/* + * These macros define how C routines will be called. ADD_ assumes that + * they will be called by fortran, which expects C routines to have an + * underscore postfixed to the name (Suns, and the Intel expect this). + * NOCHANGE indicates that fortran will be calling, and that it expects + * the name called by fortran to be identical to that compiled by the C + * (RS6K's do this). UPCASE says it expects C routines called by fortran + * to be in all upcase (CRAY wants this). + */ + +#define ADD_ 0 +#define ADD__ 1 +#define NOCHANGE 2 +#define UPCASE 3 +#define C_CALL 4 + +#ifdef UpCase +#define F77_CALL_C UPCASE +#endif + +#ifdef NoChange +#define F77_CALL_C NOCHANGE +#endif + +#ifdef Add_ +#define F77_CALL_C ADD_ +#endif + +#ifdef Add__ +#define F77_CALL_C ADD__ +#endif + +/* Default */ +#ifndef F77_CALL_C +#define F77_CALL_C ADD_ +#endif + + +#if (F77_CALL_C == ADD_) +/* + * These defines set up the naming scheme required to have a fortran 77 + * routine call a C routine + * No redefinition necessary to have following Fortran to C interface: + * FORTRAN CALL C DECLARATION + * call dgemm(...) void dgemm_(...) + * + * This is the default. + */ + +#endif + +#if (F77_CALL_C == ADD__) +/* + * These defines set up the naming scheme required to have a fortran 77 + * routine call a C routine + * for following Fortran to C interface: + * FORTRAN CALL C DECLARATION + * call dgemm(...) void dgemm__(...) + */ +#define sasum_ sasum__ +#define isamax_ isamax__ +#define scopy_ scopy__ +#define sscal_ sscal__ +#define sger_ sger__ +#define snrm2_ snrm2__ +#define ssymv_ ssymv__ +#define sdot_ sdot__ +#define saxpy_ saxpy__ +#define ssyr2_ ssyr2__ +#define srot_ srot__ +#define sgemv_ sgemv__ +#define strsv_ strsv__ +#define sgemm_ sgemm__ +#define strsm_ strsm__ + +#define dasum_ dasum__ +#define idamax_ idamax__ +#define dcopy_ dcopy__ +#define dscal_ dscal__ +#define dger_ dger__ +#define dnrm2_ dnrm2__ +#define dsymv_ dsymv__ +#define ddot_ ddot__ +#define daxpy_ daxpy__ +#define dsyr2_ dsyr2__ +#define drot_ drot__ +#define dgemv_ dgemv__ +#define dtrsv_ dtrsv__ +#define dgemm_ dgemm__ +#define dtrsm_ dtrsm__ + +#define scasum_ scasum__ +#define icamax_ icamax__ +#define ccopy_ ccopy__ +#define cscal_ cscal__ +#define scnrm2_ scnrm2__ +#define caxpy_ caxpy__ +#define cgemv_ cgemv__ +#define ctrsv_ ctrsv__ +#define cgemm_ cgemm__ +#define ctrsm_ ctrsm__ +#define cgerc_ cgerc__ +#define chemv_ chemv__ +#define cher2_ cher2__ + +#define dzasum_ dzasum__ +#define izamax_ izamax__ +#define zcopy_ zcopy__ +#define zscal_ zscal__ +#define dznrm2_ dznrm2__ +#define zaxpy_ zaxpy__ +#define zgemv_ zgemv__ +#define ztrsv_ ztrsv__ +#define zgemm_ zgemm__ +#define ztrsm_ ztrsm__ +#define zgerc_ zgerc__ +#define zhemv_ zhemv__ +#define zher2_ zher2__ + +#define c_bridge_dgssv_ c_bridge_dgssv__ +#define c_fortran_dgssv_ c_fortran_dgssv__ +#endif + +#if (F77_CALL_C == UPCASE) +/* + * These defines set up the naming scheme required to have a fortran 77 + * routine call a C routine + * following Fortran to C interface: + * FORTRAN CALL C DECLARATION + * call dgemm(...) void DGEMM(...) + */ +#define sasum_ SASUM +#define isamax_ ISAMAX +#define scopy_ SCOPY +#define sscal_ SSCAL +#define sger_ SGER +#define snrm2_ SNRM2 +#define ssymv_ SSYMV +#define sdot_ SDOT +#define saxpy_ SAXPY +#define ssyr2_ SSYR2 +#define srot_ SROT +#define sgemv_ SGEMV +#define strsv_ STRSV +#define sgemm_ SGEMM +#define strsm_ STRSM + +#define dasum_ SASUM +#define idamax_ ISAMAX +#define dcopy_ SCOPY +#define dscal_ SSCAL +#define dger_ SGER +#define dnrm2_ SNRM2 +#define dsymv_ SSYMV +#define ddot_ SDOT +#define daxpy_ SAXPY +#define dsyr2_ SSYR2 +#define drot_ SROT +#define dgemv_ SGEMV +#define dtrsv_ STRSV +#define dgemm_ SGEMM +#define dtrsm_ STRSM + +#define scasum_ SCASUM +#define icamax_ ICAMAX +#define ccopy_ CCOPY +#define cscal_ CSCAL +#define scnrm2_ SCNRM2 +#define caxpy_ CAXPY +#define cgemv_ CGEMV +#define ctrsv_ CTRSV +#define cgemm_ CGEMM +#define ctrsm_ CTRSM +#define cgerc_ CGERC +#define chemv_ CHEMV +#define cher2_ CHER2 + +#define dzasum_ SCASUM +#define izamax_ ICAMAX +#define zcopy_ CCOPY +#define zscal_ CSCAL +#define dznrm2_ SCNRM2 +#define zaxpy_ CAXPY +#define zgemv_ CGEMV +#define ztrsv_ CTRSV +#define zgemm_ CGEMM +#define ztrsm_ CTRSM +#define zgerc_ CGERC +#define zhemv_ CHEMV +#define zher2_ CHER2 + +#define c_bridge_dgssv_ C_BRIDGE_DGSSV +#define c_fortran_dgssv_ C_FORTRAN_DGSSV +#endif + +#if (F77_CALL_C == NOCHANGE) +/* + * These defines set up the naming scheme required to have a fortran 77 + * routine call a C routine + * for following Fortran to C interface: + * FORTRAN CALL C DECLARATION + * call dgemm(...) void dgemm(...) + */ +#define sasum_ sasum +#define isamax_ isamax +#define scopy_ scopy +#define sscal_ sscal +#define sger_ sger +#define snrm2_ snrm2 +#define ssymv_ ssymv +#define sdot_ sdot +#define saxpy_ saxpy +#define ssyr2_ ssyr2 +#define srot_ srot +#define sgemv_ sgemv +#define strsv_ strsv +#define sgemm_ sgemm +#define strsm_ strsm + +#define dasum_ dasum +#define idamax_ idamax +#define dcopy_ dcopy +#define dscal_ dscal +#define dger_ dger +#define dnrm2_ dnrm2 +#define dsymv_ dsymv +#define ddot_ ddot +#define daxpy_ daxpy +#define dsyr2_ dsyr2 +#define drot_ drot +#define dgemv_ dgemv +#define dtrsv_ dtrsv +#define dgemm_ dgemm +#define dtrsm_ dtrsm + +#define scasum_ scasum +#define icamax_ icamax +#define ccopy_ ccopy +#define cscal_ cscal +#define scnrm2_ scnrm2 +#define caxpy_ caxpy +#define cgemv_ cgemv +#define ctrsv_ ctrsv +#define cgemm_ cgemm +#define ctrsm_ ctrsm +#define cgerc_ cgerc +#define chemv_ chemv +#define cher2_ cher2 + +#define dzasum_ dzasum +#define izamax_ izamax +#define zcopy_ zcopy +#define zscal_ zscal +#define dznrm2_ dznrm2 +#define zaxpy_ zaxpy +#define zgemv_ zgemv +#define ztrsv_ ztrsv +#define zgemm_ zgemm +#define ztrsm_ ztrsm +#define zgerc_ zgerc +#define zhemv_ zhemv +#define zher2_ zher2 + +#define c_bridge_dgssv_ c_bridge_dgssv +#define c_fortran_dgssv_ c_fortran_dgssv +#endif + +#endif /* __SUPERLU_CNAMES */ diff --git a/intern/opennl/superlu/Makefile b/intern/opennl/superlu/Makefile new file mode 100644 index 00000000000..942ceebc79c --- /dev/null +++ b/intern/opennl/superlu/Makefile @@ -0,0 +1,40 @@ +# +# $Id$ +# +# ***** BEGIN GPL/BL DUAL LICENSE BLOCK ***** +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. The Blender +# Foundation also sells licenses for use in proprietary software under +# the Blender License. See http://www.blender.org/BL/ for information +# about this. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software Foundation, +# Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +# +# The Original Code is Copyright (C) 2001-2002 by NaN Holding BV. +# All rights reserved. +# +# The Original Code is: all of this file. +# +# Contributor(s): none yet. +# +# ***** END GPL/BL DUAL LICENSE BLOCK ***** +# opennl intern Makefile +# + +LIBNAME = superlu +DIR = $(OCGDIR)/intern/$(LIBNAME) + +include nan_compile.mk + +CCFLAGS += $(NAN_LEVEL_2_CPP_WARNINGS) + diff --git a/intern/opennl/superlu/colamd.c b/intern/opennl/superlu/colamd.c new file mode 100644 index 00000000000..b60718f9938 --- /dev/null +++ b/intern/opennl/superlu/colamd.c @@ -0,0 +1,2583 @@ +/* ========================================================================== */ +/* === colamd - a sparse matrix column ordering algorithm =================== */ +/* ========================================================================== */ + +/* + colamd: An approximate minimum degree column ordering algorithm. + + Purpose: + + Colamd computes a permutation Q such that the Cholesky factorization of + (AQ)'(AQ) has less fill-in and requires fewer floating point operations + than A'A. This also provides a good ordering for sparse partial + pivoting methods, P(AQ) = LU, where Q is computed prior to numerical + factorization, and P is computed during numerical factorization via + conventional partial pivoting with row interchanges. Colamd is the + column ordering method used in SuperLU, part of the ScaLAPACK library. + It is also available as user-contributed software for Matlab 5.2, + available from MathWorks, Inc. (http://www.mathworks.com). This + routine can be used in place of COLMMD in Matlab. By default, the \ + and / operators in Matlab perform a column ordering (using COLMMD) + prior to LU factorization using sparse partial pivoting, in the + built-in Matlab LU(A) routine. + + Authors: + + The authors of the code itself are Stefan I. Larimore and Timothy A. + Davis (davis@cise.ufl.edu), University of Florida. The algorithm was + developed in collaboration with John Gilbert, Xerox PARC, and Esmond + Ng, Oak Ridge National Laboratory. + + Date: + + August 3, 1998. Version 1.0. + + Acknowledgements: + + This work was supported by the National Science Foundation, under + grants DMS-9504974 and DMS-9803599. + + Notice: + + Copyright (c) 1998 by the University of Florida. All Rights Reserved. + + THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY + EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. + + Permission is hereby granted to use or copy this program for any + purpose, provided the above notices are retained on all copies. + User documentation of any code that uses this code must cite the + Authors, the Copyright, and "Used by permission." If this code is + accessible from within Matlab, then typing "help colamd" or "colamd" + (with no arguments) must cite the Authors. Permission to modify the + code and to distribute modified code is granted, provided the above + notices are retained, and a notice that the code was modified is + included with the above copyright notice. You must also retain the + Availability information below, of the original version. + + This software is provided free of charge. + + Availability: + + This file is located at + + http://www.cise.ufl.edu/~davis/colamd/colamd.c + + The colamd.h file is required, located in the same directory. + The colamdmex.c file provides a Matlab interface for colamd. + The symamdmex.c file provides a Matlab interface for symamd, which is + a symmetric ordering based on this code, colamd.c. All codes are + purely ANSI C compliant (they use no Unix-specific routines, include + files, etc.). +*/ + +/* ========================================================================== */ +/* === Description of user-callable routines ================================ */ +/* ========================================================================== */ + +/* + Each user-callable routine (declared as PUBLIC) is briefly described below. + Refer to the comments preceding each routine for more details. + + ---------------------------------------------------------------------------- + colamd_recommended: + ---------------------------------------------------------------------------- + + Usage: + + Alen = colamd_recommended (nnz, n_row, n_col) ; + + Purpose: + + Returns recommended value of Alen for use by colamd. Returns -1 + if any input argument is negative. + + Arguments: + + int nnz ; Number of nonzeros in the matrix A. This must + be the same value as p [n_col] in the call to + colamd - otherwise you will get a wrong value + of the recommended memory to use. + int n_row ; Number of rows in the matrix A. + int n_col ; Number of columns in the matrix A. + + ---------------------------------------------------------------------------- + colamd_set_defaults: + ---------------------------------------------------------------------------- + + Usage: + + colamd_set_defaults (knobs) ; + + Purpose: + + Sets the default parameters. + + Arguments: + + double knobs [COLAMD_KNOBS] ; Output only. + + Rows with more than (knobs [COLAMD_DENSE_ROW] * n_col) entries + are removed prior to ordering. Columns with more than + (knobs [COLAMD_DENSE_COL] * n_row) entries are removed + prior to ordering, and placed last in the output column + ordering. Default values of these two knobs are both 0.5. + Currently, only knobs [0] and knobs [1] are used, but future + versions may use more knobs. If so, they will be properly set + to their defaults by the future version of colamd_set_defaults, + so that the code that calls colamd will not need to change, + assuming that you either use colamd_set_defaults, or pass a + (double *) NULL pointer as the knobs array to colamd. + + ---------------------------------------------------------------------------- + colamd: + ---------------------------------------------------------------------------- + + Usage: + + colamd (n_row, n_col, Alen, A, p, knobs) ; + + Purpose: + + Computes a column ordering (Q) of A such that P(AQ)=LU or + (AQ)'AQ=LL' have less fill-in and require fewer floating point + operations than factorizing the unpermuted matrix A or A'A, + respectively. + + Arguments: + + int n_row ; + + Number of rows in the matrix A. + Restriction: n_row >= 0. + Colamd returns FALSE if n_row is negative. + + int n_col ; + + Number of columns in the matrix A. + Restriction: n_col >= 0. + Colamd returns FALSE if n_col is negative. + + int Alen ; + + Restriction (see note): + Alen >= 2*nnz + 6*(n_col+1) + 4*(n_row+1) + n_col + COLAMD_STATS + Colamd returns FALSE if these conditions are not met. + + Note: this restriction makes an modest assumption regarding + the size of the two typedef'd structures, below. We do, + however, guarantee that + Alen >= colamd_recommended (nnz, n_row, n_col) + will be sufficient. + + int A [Alen] ; Input argument, stats on output. + + A is an integer array of size Alen. Alen must be at least as + large as the bare minimum value given above, but this is very + low, and can result in excessive run time. For best + performance, we recommend that Alen be greater than or equal to + colamd_recommended (nnz, n_row, n_col), which adds + nnz/5 to the bare minimum value given above. + + On input, the row indices of the entries in column c of the + matrix are held in A [(p [c]) ... (p [c+1]-1)]. The row indices + in a given column c need not be in ascending order, and + duplicate row indices may be be present. However, colamd will + work a little faster if both of these conditions are met + (Colamd puts the matrix into this format, if it finds that the + the conditions are not met). + + The matrix is 0-based. That is, rows are in the range 0 to + n_row-1, and columns are in the range 0 to n_col-1. Colamd + returns FALSE if any row index is out of range. + + The contents of A are modified during ordering, and are thus + undefined on output with the exception of a few statistics + about the ordering (A [0..COLAMD_STATS-1]): + A [0]: number of dense or empty rows ignored. + A [1]: number of dense or empty columns ignored (and ordered + last in the output permutation p) + A [2]: number of garbage collections performed. + A [3]: 0, if all row indices in each column were in sorted + order, and no duplicates were present. + 1, otherwise (in which case colamd had to do more work) + Note that a row can become "empty" if it contains only + "dense" and/or "empty" columns, and similarly a column can + become "empty" if it only contains "dense" and/or "empty" rows. + Future versions may return more statistics in A, but the usage + of these 4 entries in A will remain unchanged. + + int p [n_col+1] ; Both input and output argument. + + p is an integer array of size n_col+1. On input, it holds the + "pointers" for the column form of the matrix A. Column c of + the matrix A is held in A [(p [c]) ... (p [c+1]-1)]. The first + entry, p [0], must be zero, and p [c] <= p [c+1] must hold + for all c in the range 0 to n_col-1. The value p [n_col] is + thus the total number of entries in the pattern of the matrix A. + Colamd returns FALSE if these conditions are not met. + + On output, if colamd returns TRUE, the array p holds the column + permutation (Q, for P(AQ)=LU or (AQ)'(AQ)=LL'), where p [0] is + the first column index in the new ordering, and p [n_col-1] is + the last. That is, p [k] = j means that column j of A is the + kth pivot column, in AQ, where k is in the range 0 to n_col-1 + (p [0] = j means that column j of A is the first column in AQ). + + If colamd returns FALSE, then no permutation is returned, and + p is undefined on output. + + double knobs [COLAMD_KNOBS] ; Input only. + + See colamd_set_defaults for a description. If the knobs array + is not present (that is, if a (double *) NULL pointer is passed + in its place), then the default values of the parameters are + used instead. + +*/ + + +/* ========================================================================== */ +/* === Include files ======================================================== */ +/* ========================================================================== */ + +/* limits.h: the largest positive integer (INT_MAX) */ +#include + +/* colamd.h: knob array size, stats output size, and global prototypes */ +#include "colamd.h" + +/* ========================================================================== */ +/* === Scaffolding code definitions ======================================== */ +/* ========================================================================== */ + +/* Ensure that debugging is turned off: */ +#ifndef NDEBUG +#define NDEBUG +#endif + +/* assert.h: the assert macro (no debugging if NDEBUG is defined) */ +#include + +/* + Our "scaffolding code" philosophy: In our opinion, well-written library + code should keep its "debugging" code, and just normally have it turned off + by the compiler so as not to interfere with performance. This serves + several purposes: + + (1) assertions act as comments to the reader, telling you what the code + expects at that point. All assertions will always be true (unless + there really is a bug, of course). + + (2) leaving in the scaffolding code assists anyone who would like to modify + the code, or understand the algorithm (by reading the debugging output, + one can get a glimpse into what the code is doing). + + (3) (gasp!) for actually finding bugs. This code has been heavily tested + and "should" be fully functional and bug-free ... but you never know... + + To enable debugging, comment out the "#define NDEBUG" above. The code will + become outrageously slow when debugging is enabled. To control the level of + debugging output, set an environment variable D to 0 (little), 1 (some), + 2, 3, or 4 (lots). +*/ + +/* ========================================================================== */ +/* === Row and Column structures ============================================ */ +/* ========================================================================== */ + +typedef struct ColInfo_struct +{ + int start ; /* index for A of first row in this column, or DEAD */ + /* if column is dead */ + int length ; /* number of rows in this column */ + union + { + int thickness ; /* number of original columns represented by this */ + /* col, if the column is alive */ + int parent ; /* parent in parent tree super-column structure, if */ + /* the column is dead */ + } shared1 ; + union + { + int score ; /* the score used to maintain heap, if col is alive */ + int order ; /* pivot ordering of this column, if col is dead */ + } shared2 ; + union + { + int headhash ; /* head of a hash bucket, if col is at the head of */ + /* a degree list */ + int hash ; /* hash value, if col is not in a degree list */ + int prev ; /* previous column in degree list, if col is in a */ + /* degree list (but not at the head of a degree list) */ + } shared3 ; + union + { + int degree_next ; /* next column, if col is in a degree list */ + int hash_next ; /* next column, if col is in a hash list */ + } shared4 ; + +} ColInfo ; + +typedef struct RowInfo_struct +{ + int start ; /* index for A of first col in this row */ + int length ; /* number of principal columns in this row */ + union + { + int degree ; /* number of principal & non-principal columns in row */ + int p ; /* used as a row pointer in init_rows_cols () */ + } shared1 ; + union + { + int mark ; /* for computing set differences and marking dead rows*/ + int first_column ;/* first column in row (used in garbage collection) */ + } shared2 ; + +} RowInfo ; + +/* ========================================================================== */ +/* === Definitions ========================================================== */ +/* ========================================================================== */ + +#define MAX(a,b) (((a) > (b)) ? (a) : (b)) +#define MIN(a,b) (((a) < (b)) ? (a) : (b)) + +#define ONES_COMPLEMENT(r) (-(r)-1) + +#define TRUE (1) +#define FALSE (0) +#define EMPTY (-1) + +/* Row and column status */ +#define ALIVE (0) +#define DEAD (-1) + +/* Column status */ +#define DEAD_PRINCIPAL (-1) +#define DEAD_NON_PRINCIPAL (-2) + +/* Macros for row and column status update and checking. */ +#define ROW_IS_DEAD(r) ROW_IS_MARKED_DEAD (Row[r].shared2.mark) +#define ROW_IS_MARKED_DEAD(row_mark) (row_mark < ALIVE) +#define ROW_IS_ALIVE(r) (Row [r].shared2.mark >= ALIVE) +#define COL_IS_DEAD(c) (Col [c].start < ALIVE) +#define COL_IS_ALIVE(c) (Col [c].start >= ALIVE) +#define COL_IS_DEAD_PRINCIPAL(c) (Col [c].start == DEAD_PRINCIPAL) +#define KILL_ROW(r) { Row [r].shared2.mark = DEAD ; } +#define KILL_PRINCIPAL_COL(c) { Col [c].start = DEAD_PRINCIPAL ; } +#define KILL_NON_PRINCIPAL_COL(c) { Col [c].start = DEAD_NON_PRINCIPAL ; } + +/* Routines are either PUBLIC (user-callable) or PRIVATE (not user-callable) */ +#define PUBLIC +#define PRIVATE static + +/* ========================================================================== */ +/* === Prototypes of PRIVATE routines ======================================= */ +/* ========================================================================== */ + +PRIVATE int init_rows_cols +( + int n_row, + int n_col, + RowInfo Row [], + ColInfo Col [], + int A [], + int p [] +) ; + +PRIVATE void init_scoring +( + int n_row, + int n_col, + RowInfo Row [], + ColInfo Col [], + int A [], + int head [], + double knobs [COLAMD_KNOBS], + int *p_n_row2, + int *p_n_col2, + int *p_max_deg +) ; + +PRIVATE int find_ordering +( + int n_row, + int n_col, + int Alen, + RowInfo Row [], + ColInfo Col [], + int A [], + int head [], + int n_col2, + int max_deg, + int pfree +) ; + +PRIVATE void order_children +( + int n_col, + ColInfo Col [], + int p [] +) ; + +PRIVATE void detect_super_cols +( +#ifndef NDEBUG + int n_col, + RowInfo Row [], +#endif + ColInfo Col [], + int A [], + int head [], + int row_start, + int row_length +) ; + +PRIVATE int garbage_collection +( + int n_row, + int n_col, + RowInfo Row [], + ColInfo Col [], + int A [], + int *pfree +) ; + +PRIVATE int clear_mark +( + int n_row, + RowInfo Row [] +) ; + +/* ========================================================================== */ +/* === Debugging definitions ================================================ */ +/* ========================================================================== */ + +#ifndef NDEBUG + +/* === With debugging ======================================================= */ + +/* stdlib.h: for getenv and atoi, to get debugging level from environment */ +#include + +/* stdio.h: for printf (no printing if debugging is turned off) */ +#include + +PRIVATE void debug_deg_lists +( + int n_row, + int n_col, + RowInfo Row [], + ColInfo Col [], + int head [], + int min_score, + int should, + int max_deg +) ; + +PRIVATE void debug_mark +( + int n_row, + RowInfo Row [], + int tag_mark, + int max_mark +) ; + +PRIVATE void debug_matrix +( + int n_row, + int n_col, + RowInfo Row [], + ColInfo Col [], + int A [] +) ; + +PRIVATE void debug_structures +( + int n_row, + int n_col, + RowInfo Row [], + ColInfo Col [], + int A [], + int n_col2 +) ; + +/* the following is the *ONLY* global variable in this file, and is only */ +/* present when debugging */ + +PRIVATE int debug_colamd ; /* debug print level */ + +#define DEBUG0(params) { (void) printf params ; } +#define DEBUG1(params) { if (debug_colamd >= 1) (void) printf params ; } +#define DEBUG2(params) { if (debug_colamd >= 2) (void) printf params ; } +#define DEBUG3(params) { if (debug_colamd >= 3) (void) printf params ; } +#define DEBUG4(params) { if (debug_colamd >= 4) (void) printf params ; } + +#else + +/* === No debugging ========================================================= */ + +#define DEBUG0(params) ; +#define DEBUG1(params) ; +#define DEBUG2(params) ; +#define DEBUG3(params) ; +#define DEBUG4(params) ; + +#endif + +/* ========================================================================== */ + + +/* ========================================================================== */ +/* === USER-CALLABLE ROUTINES: ============================================== */ +/* ========================================================================== */ + + +/* ========================================================================== */ +/* === colamd_recommended =================================================== */ +/* ========================================================================== */ + +/* + The colamd_recommended routine returns the suggested size for Alen. This + value has been determined to provide good balance between the number of + garbage collections and the memory requirements for colamd. +*/ + +PUBLIC int colamd_recommended /* returns recommended value of Alen. */ +( + /* === Parameters ======================================================= */ + + int nnz, /* number of nonzeros in A */ + int n_row, /* number of rows in A */ + int n_col /* number of columns in A */ +) +{ + /* === Local variables ================================================== */ + + int minimum ; /* bare minimum requirements */ + int recommended ; /* recommended value of Alen */ + + if (nnz < 0 || n_row < 0 || n_col < 0) + { + /* return -1 if any input argument is corrupted */ + DEBUG0 (("colamd_recommended error!")) ; + DEBUG0 ((" nnz: %d, n_row: %d, n_col: %d\n", nnz, n_row, n_col)) ; + return (-1) ; + } + + minimum = + 2 * (nnz) /* for A */ + + (((n_col) + 1) * sizeof (ColInfo) / sizeof (int)) /* for Col */ + + (((n_row) + 1) * sizeof (RowInfo) / sizeof (int)) /* for Row */ + + n_col /* minimum elbow room to guarrantee success */ + + COLAMD_STATS ; /* for output statistics */ + + /* recommended is equal to the minumum plus enough memory to keep the */ + /* number garbage collections low */ + recommended = minimum + nnz/5 ; + + return (recommended) ; +} + + +/* ========================================================================== */ +/* === colamd_set_defaults ================================================== */ +/* ========================================================================== */ + +/* + The colamd_set_defaults routine sets the default values of the user- + controllable parameters for colamd: + + knobs [0] rows with knobs[0]*n_col entries or more are removed + prior to ordering. + + knobs [1] columns with knobs[1]*n_row entries or more are removed + prior to ordering, and placed last in the column + permutation. + + knobs [2..19] unused, but future versions might use this +*/ + +PUBLIC void colamd_set_defaults +( + /* === Parameters ======================================================= */ + + double knobs [COLAMD_KNOBS] /* knob array */ +) +{ + /* === Local variables ================================================== */ + + int i ; + + if (!knobs) + { + return ; /* no knobs to initialize */ + } + for (i = 0 ; i < COLAMD_KNOBS ; i++) + { + knobs [i] = 0 ; + } + knobs [COLAMD_DENSE_ROW] = 0.5 ; /* ignore rows over 50% dense */ + knobs [COLAMD_DENSE_COL] = 0.5 ; /* ignore columns over 50% dense */ +} + + +/* ========================================================================== */ +/* === colamd =============================================================== */ +/* ========================================================================== */ + +/* + The colamd routine computes a column ordering Q of a sparse matrix + A such that the LU factorization P(AQ) = LU remains sparse, where P is + selected via partial pivoting. The routine can also be viewed as + providing a permutation Q such that the Cholesky factorization + (AQ)'(AQ) = LL' remains sparse. + + On input, the nonzero patterns of the columns of A are stored in the + array A, in order 0 to n_col-1. A is held in 0-based form (rows in the + range 0 to n_row-1 and columns in the range 0 to n_col-1). Row indices + for column c are located in A [(p [c]) ... (p [c+1]-1)], where p [0] = 0, + and thus p [n_col] is the number of entries in A. The matrix is + destroyed on output. The row indices within each column do not have to + be sorted (from small to large row indices), and duplicate row indices + may be present. However, colamd will work a little faster if columns are + sorted and no duplicates are present. Matlab 5.2 always passes the matrix + with sorted columns, and no duplicates. + + The integer array A is of size Alen. Alen must be at least of size + (where nnz is the number of entries in A): + + nnz for the input column form of A + + nnz for a row form of A that colamd generates + + 6*(n_col+1) for a ColInfo Col [0..n_col] array + (this assumes sizeof (ColInfo) is 6 int's). + + 4*(n_row+1) for a RowInfo Row [0..n_row] array + (this assumes sizeof (RowInfo) is 4 int's). + + elbow_room must be at least n_col. We recommend at least + nnz/5 in addition to that. If sufficient, + changes in the elbow room affect the ordering + time only, not the ordering itself. + + COLAMD_STATS for the output statistics + + Colamd returns FALSE is memory is insufficient, or TRUE otherwise. + + On input, the caller must specify: + + n_row the number of rows of A + n_col the number of columns of A + Alen the size of the array A + A [0 ... nnz-1] the row indices, where nnz = p [n_col] + A [nnz ... Alen-1] (need not be initialized by the user) + p [0 ... n_col] the column pointers, p [0] = 0, and p [n_col] + is the number of entries in A. Column c of A + is stored in A [p [c] ... p [c+1]-1]. + knobs [0 ... 19] a set of parameters that control the behavior + of colamd. If knobs is a NULL pointer the + defaults are used. The user-callable + colamd_set_defaults routine sets the default + parameters. See that routine for a description + of the user-controllable parameters. + + If the return value of Colamd is TRUE, then on output: + + p [0 ... n_col-1] the column permutation. p [0] is the first + column index, and p [n_col-1] is the last. + That is, p [k] = j means that column j of A + is the kth column of AQ. + + A is undefined on output (the matrix pattern is + destroyed), except for the following statistics: + + A [0] the number of dense (or empty) rows ignored + A [1] the number of dense (or empty) columms. These + are ordered last, in their natural order. + A [2] the number of garbage collections performed. + If this is excessive, then you would have + gotten your results faster if Alen was larger. + A [3] 0, if all row indices in each column were in + sorted order and no duplicates were present. + 1, if there were unsorted or duplicate row + indices in the input. You would have gotten + your results faster if A [3] was returned as 0. + + If the return value of Colamd is FALSE, then A and p are undefined on + output. +*/ + +PUBLIC int colamd /* returns TRUE if successful */ +( + /* === Parameters ======================================================= */ + + int n_row, /* number of rows in A */ + int n_col, /* number of columns in A */ + int Alen, /* length of A */ + int A [], /* row indices of A */ + int p [], /* pointers to columns in A */ + double knobs [COLAMD_KNOBS] /* parameters (uses defaults if NULL) */ +) +{ + /* === Local variables ================================================== */ + + int i ; /* loop index */ + int nnz ; /* nonzeros in A */ + int Row_size ; /* size of Row [], in integers */ + int Col_size ; /* size of Col [], in integers */ + int elbow_room ; /* remaining free space */ + RowInfo *Row ; /* pointer into A of Row [0..n_row] array */ + ColInfo *Col ; /* pointer into A of Col [0..n_col] array */ + int n_col2 ; /* number of non-dense, non-empty columns */ + int n_row2 ; /* number of non-dense, non-empty rows */ + int ngarbage ; /* number of garbage collections performed */ + int max_deg ; /* maximum row degree */ + double default_knobs [COLAMD_KNOBS] ; /* default knobs knobs array */ + int init_result ; /* return code from initialization */ + +#ifndef NDEBUG + debug_colamd = 0 ; /* no debug printing */ + /* get "D" environment variable, which gives the debug printing level */ + if (getenv ("D")) debug_colamd = atoi (getenv ("D")) ; + DEBUG0 (("debug version, D = %d (THIS WILL BE SLOOOOW!)\n", debug_colamd)) ; +#endif + + /* === Check the input arguments ======================================== */ + + if (n_row < 0 || n_col < 0 || !A || !p) + { + /* n_row and n_col must be non-negative, A and p must be present */ + DEBUG0 (("colamd error! %d %d %d\n", n_row, n_col, Alen)) ; + return (FALSE) ; + } + nnz = p [n_col] ; + if (nnz < 0 || p [0] != 0) + { + /* nnz must be non-negative, and p [0] must be zero */ + DEBUG0 (("colamd error! %d %d\n", nnz, p [0])) ; + return (FALSE) ; + } + + /* === If no knobs, set default parameters ============================== */ + + if (!knobs) + { + knobs = default_knobs ; + colamd_set_defaults (knobs) ; + } + + /* === Allocate the Row and Col arrays from array A ===================== */ + + Col_size = (n_col + 1) * sizeof (ColInfo) / sizeof (int) ; + Row_size = (n_row + 1) * sizeof (RowInfo) / sizeof (int) ; + elbow_room = Alen - (2*nnz + Col_size + Row_size) ; + if (elbow_room < n_col + COLAMD_STATS) + { + /* not enough space in array A to perform the ordering */ + DEBUG0 (("colamd error! elbow_room %d, %d\n", elbow_room,n_col)) ; + return (FALSE) ; + } + Alen = 2*nnz + elbow_room ; + Col = (ColInfo *) &A [Alen] ; + Row = (RowInfo *) &A [Alen + Col_size] ; + + /* === Construct the row and column data structures ===================== */ + + init_result = init_rows_cols (n_row, n_col, Row, Col, A, p) ; + if (init_result == -1) + { + /* input matrix is invalid */ + DEBUG0 (("colamd error! matrix invalid\n")) ; + return (FALSE) ; + } + + /* === Initialize scores, kill dense rows/columns ======================= */ + + init_scoring (n_row, n_col, Row, Col, A, p, knobs, + &n_row2, &n_col2, &max_deg) ; + + /* === Order the supercolumns =========================================== */ + + ngarbage = find_ordering (n_row, n_col, Alen, Row, Col, A, p, + n_col2, max_deg, 2*nnz) ; + + /* === Order the non-principal columns ================================== */ + + order_children (n_col, Col, p) ; + + /* === Return statistics in A =========================================== */ + + for (i = 0 ; i < COLAMD_STATS ; i++) + { + A [i] = 0 ; + } + A [COLAMD_DENSE_ROW] = n_row - n_row2 ; + A [COLAMD_DENSE_COL] = n_col - n_col2 ; + A [COLAMD_DEFRAG_COUNT] = ngarbage ; + A [COLAMD_JUMBLED_COLS] = init_result ; + + return (TRUE) ; +} + + +/* ========================================================================== */ +/* === NON-USER-CALLABLE ROUTINES: ========================================== */ +/* ========================================================================== */ + +/* There are no user-callable routines beyond this point in the file */ + + +/* ========================================================================== */ +/* === init_rows_cols ======================================================= */ +/* ========================================================================== */ + +/* + Takes the column form of the matrix in A and creates the row form of the + matrix. Also, row and column attributes are stored in the Col and Row + structs. If the columns are un-sorted or contain duplicate row indices, + this routine will also sort and remove duplicate row indices from the + column form of the matrix. Returns -1 on error, 1 if columns jumbled, + or 0 if columns not jumbled. Not user-callable. +*/ + +PRIVATE int init_rows_cols /* returns status code */ +( + /* === Parameters ======================================================= */ + + int n_row, /* number of rows of A */ + int n_col, /* number of columns of A */ + RowInfo Row [], /* of size n_row+1 */ + ColInfo Col [], /* of size n_col+1 */ + int A [], /* row indices of A, of size Alen */ + int p [] /* pointers to columns in A, of size n_col+1 */ +) +{ + /* === Local variables ================================================== */ + + int col ; /* a column index */ + int row ; /* a row index */ + int *cp ; /* a column pointer */ + int *cp_end ; /* a pointer to the end of a column */ + int *rp ; /* a row pointer */ + int *rp_end ; /* a pointer to the end of a row */ + int last_start ; /* start index of previous column in A */ + int start ; /* start index of column in A */ + int last_row ; /* previous row */ + int jumbled_columns ; /* indicates if columns are jumbled */ + + /* === Initialize columns, and check column pointers ==================== */ + + last_start = 0 ; + for (col = 0 ; col < n_col ; col++) + { + start = p [col] ; + if (start < last_start) + { + /* column pointers must be non-decreasing */ + DEBUG0 (("colamd error! last p %d p [col] %d\n",last_start,start)); + return (-1) ; + } + Col [col].start = start ; + Col [col].length = p [col+1] - start ; + Col [col].shared1.thickness = 1 ; + Col [col].shared2.score = 0 ; + Col [col].shared3.prev = EMPTY ; + Col [col].shared4.degree_next = EMPTY ; + last_start = start ; + } + /* must check the end pointer for last column */ + if (p [n_col] < last_start) + { + /* column pointers must be non-decreasing */ + DEBUG0 (("colamd error! last p %d p [n_col] %d\n",p[col],last_start)) ; + return (-1) ; + } + + /* p [0..n_col] no longer needed, used as "head" in subsequent routines */ + + /* === Scan columns, compute row degrees, and check row indices ========= */ + + jumbled_columns = FALSE ; + + for (row = 0 ; row < n_row ; row++) + { + Row [row].length = 0 ; + Row [row].shared2.mark = -1 ; + } + + for (col = 0 ; col < n_col ; col++) + { + last_row = -1 ; + + cp = &A [p [col]] ; + cp_end = &A [p [col+1]] ; + + while (cp < cp_end) + { + row = *cp++ ; + + /* make sure row indices within range */ + if (row < 0 || row >= n_row) + { + DEBUG0 (("colamd error! col %d row %d last_row %d\n", + col, row, last_row)) ; + return (-1) ; + } + else if (row <= last_row) + { + /* row indices are not sorted or repeated, thus cols */ + /* are jumbled */ + jumbled_columns = TRUE ; + } + /* prevent repeated row from being counted */ + if (Row [row].shared2.mark != col) + { + Row [row].length++ ; + Row [row].shared2.mark = col ; + last_row = row ; + } + else + { + /* this is a repeated entry in the column, */ + /* it will be removed */ + Col [col].length-- ; + } + } + } + + /* === Compute row pointers ============================================= */ + + /* row form of the matrix starts directly after the column */ + /* form of matrix in A */ + Row [0].start = p [n_col] ; + Row [0].shared1.p = Row [0].start ; + Row [0].shared2.mark = -1 ; + for (row = 1 ; row < n_row ; row++) + { + Row [row].start = Row [row-1].start + Row [row-1].length ; + Row [row].shared1.p = Row [row].start ; + Row [row].shared2.mark = -1 ; + } + + /* === Create row form ================================================== */ + + if (jumbled_columns) + { + /* if cols jumbled, watch for repeated row indices */ + for (col = 0 ; col < n_col ; col++) + { + cp = &A [p [col]] ; + cp_end = &A [p [col+1]] ; + while (cp < cp_end) + { + row = *cp++ ; + if (Row [row].shared2.mark != col) + { + A [(Row [row].shared1.p)++] = col ; + Row [row].shared2.mark = col ; + } + } + } + } + else + { + /* if cols not jumbled, we don't need the mark (this is faster) */ + for (col = 0 ; col < n_col ; col++) + { + cp = &A [p [col]] ; + cp_end = &A [p [col+1]] ; + while (cp < cp_end) + { + A [(Row [*cp++].shared1.p)++] = col ; + } + } + } + + /* === Clear the row marks and set row degrees ========================== */ + + for (row = 0 ; row < n_row ; row++) + { + Row [row].shared2.mark = 0 ; + Row [row].shared1.degree = Row [row].length ; + } + + /* === See if we need to re-create columns ============================== */ + + if (jumbled_columns) + { + +#ifndef NDEBUG + /* make sure column lengths are correct */ + for (col = 0 ; col < n_col ; col++) + { + p [col] = Col [col].length ; + } + for (row = 0 ; row < n_row ; row++) + { + rp = &A [Row [row].start] ; + rp_end = rp + Row [row].length ; + while (rp < rp_end) + { + p [*rp++]-- ; + } + } + for (col = 0 ; col < n_col ; col++) + { + assert (p [col] == 0) ; + } + /* now p is all zero (different than when debugging is turned off) */ +#endif + + /* === Compute col pointers ========================================= */ + + /* col form of the matrix starts at A [0]. */ + /* Note, we may have a gap between the col form and the row */ + /* form if there were duplicate entries, if so, it will be */ + /* removed upon the first garbage collection */ + Col [0].start = 0 ; + p [0] = Col [0].start ; + for (col = 1 ; col < n_col ; col++) + { + /* note that the lengths here are for pruned columns, i.e. */ + /* no duplicate row indices will exist for these columns */ + Col [col].start = Col [col-1].start + Col [col-1].length ; + p [col] = Col [col].start ; + } + + /* === Re-create col form =========================================== */ + + for (row = 0 ; row < n_row ; row++) + { + rp = &A [Row [row].start] ; + rp_end = rp + Row [row].length ; + while (rp < rp_end) + { + A [(p [*rp++])++] = row ; + } + } + return (1) ; + } + else + { + /* no columns jumbled (this is faster) */ + return (0) ; + } +} + + +/* ========================================================================== */ +/* === init_scoring ========================================================= */ +/* ========================================================================== */ + +/* + Kills dense or empty columns and rows, calculates an initial score for + each column, and places all columns in the degree lists. Not user-callable. +*/ + +PRIVATE void init_scoring +( + /* === Parameters ======================================================= */ + + int n_row, /* number of rows of A */ + int n_col, /* number of columns of A */ + RowInfo Row [], /* of size n_row+1 */ + ColInfo Col [], /* of size n_col+1 */ + int A [], /* column form and row form of A */ + int head [], /* of size n_col+1 */ + double knobs [COLAMD_KNOBS],/* parameters */ + int *p_n_row2, /* number of non-dense, non-empty rows */ + int *p_n_col2, /* number of non-dense, non-empty columns */ + int *p_max_deg /* maximum row degree */ +) +{ + /* === Local variables ================================================== */ + + int c ; /* a column index */ + int r, row ; /* a row index */ + int *cp ; /* a column pointer */ + int deg ; /* degree (# entries) of a row or column */ + int *cp_end ; /* a pointer to the end of a column */ + int *new_cp ; /* new column pointer */ + int col_length ; /* length of pruned column */ + int score ; /* current column score */ + int n_col2 ; /* number of non-dense, non-empty columns */ + int n_row2 ; /* number of non-dense, non-empty rows */ + int dense_row_count ; /* remove rows with more entries than this */ + int dense_col_count ; /* remove cols with more entries than this */ + int min_score ; /* smallest column score */ + int max_deg ; /* maximum row degree */ + int next_col ; /* Used to add to degree list.*/ +#ifndef NDEBUG + int debug_count ; /* debug only. */ +#endif + + /* === Extract knobs ==================================================== */ + + dense_row_count = MAX (0, MIN (knobs [COLAMD_DENSE_ROW] * n_col, n_col)) ; + dense_col_count = MAX (0, MIN (knobs [COLAMD_DENSE_COL] * n_row, n_row)) ; + DEBUG0 (("densecount: %d %d\n", dense_row_count, dense_col_count)) ; + max_deg = 0 ; + n_col2 = n_col ; + n_row2 = n_row ; + + /* === Kill empty columns =============================================== */ + + /* Put the empty columns at the end in their natural, so that LU */ + /* factorization can proceed as far as possible. */ + for (c = n_col-1 ; c >= 0 ; c--) + { + deg = Col [c].length ; + if (deg == 0) + { + /* this is a empty column, kill and order it last */ + Col [c].shared2.order = --n_col2 ; + KILL_PRINCIPAL_COL (c) ; + } + } + DEBUG0 (("null columns killed: %d\n", n_col - n_col2)) ; + + /* === Kill dense columns =============================================== */ + + /* Put the dense columns at the end, in their natural order */ + for (c = n_col-1 ; c >= 0 ; c--) + { + /* skip any dead columns */ + if (COL_IS_DEAD (c)) + { + continue ; + } + deg = Col [c].length ; + if (deg > dense_col_count) + { + /* this is a dense column, kill and order it last */ + Col [c].shared2.order = --n_col2 ; + /* decrement the row degrees */ + cp = &A [Col [c].start] ; + cp_end = cp + Col [c].length ; + while (cp < cp_end) + { + Row [*cp++].shared1.degree-- ; + } + KILL_PRINCIPAL_COL (c) ; + } + } + DEBUG0 (("Dense and null columns killed: %d\n", n_col - n_col2)) ; + + /* === Kill dense and empty rows ======================================== */ + + for (r = 0 ; r < n_row ; r++) + { + deg = Row [r].shared1.degree ; + assert (deg >= 0 && deg <= n_col) ; + if (deg > dense_row_count || deg == 0) + { + /* kill a dense or empty row */ + KILL_ROW (r) ; + --n_row2 ; + } + else + { + /* keep track of max degree of remaining rows */ + max_deg = MAX (max_deg, deg) ; + } + } + DEBUG0 (("Dense and null rows killed: %d\n", n_row - n_row2)) ; + + /* === Compute initial column scores ==================================== */ + + /* At this point the row degrees are accurate. They reflect the number */ + /* of "live" (non-dense) columns in each row. No empty rows exist. */ + /* Some "live" columns may contain only dead rows, however. These are */ + /* pruned in the code below. */ + + /* now find the initial matlab score for each column */ + for (c = n_col-1 ; c >= 0 ; c--) + { + /* skip dead column */ + if (COL_IS_DEAD (c)) + { + continue ; + } + score = 0 ; + cp = &A [Col [c].start] ; + new_cp = cp ; + cp_end = cp + Col [c].length ; + while (cp < cp_end) + { + /* get a row */ + row = *cp++ ; + /* skip if dead */ + if (ROW_IS_DEAD (row)) + { + continue ; + } + /* compact the column */ + *new_cp++ = row ; + /* add row's external degree */ + score += Row [row].shared1.degree - 1 ; + /* guard against integer overflow */ + score = MIN (score, n_col) ; + } + /* determine pruned column length */ + col_length = (int) (new_cp - &A [Col [c].start]) ; + if (col_length == 0) + { + /* a newly-made null column (all rows in this col are "dense" */ + /* and have already been killed) */ + DEBUG0 (("Newly null killed: %d\n", c)) ; + Col [c].shared2.order = --n_col2 ; + KILL_PRINCIPAL_COL (c) ; + } + else + { + /* set column length and set score */ + assert (score >= 0) ; + assert (score <= n_col) ; + Col [c].length = col_length ; + Col [c].shared2.score = score ; + } + } + DEBUG0 (("Dense, null, and newly-null columns killed: %d\n",n_col-n_col2)) ; + + /* At this point, all empty rows and columns are dead. All live columns */ + /* are "clean" (containing no dead rows) and simplicial (no supercolumns */ + /* yet). Rows may contain dead columns, but all live rows contain at */ + /* least one live column. */ + +#ifndef NDEBUG + debug_structures (n_row, n_col, Row, Col, A, n_col2) ; +#endif + + /* === Initialize degree lists ========================================== */ + +#ifndef NDEBUG + debug_count = 0 ; +#endif + + /* clear the hash buckets */ + for (c = 0 ; c <= n_col ; c++) + { + head [c] = EMPTY ; + } + min_score = n_col ; + /* place in reverse order, so low column indices are at the front */ + /* of the lists. This is to encourage natural tie-breaking */ + for (c = n_col-1 ; c >= 0 ; c--) + { + /* only add principal columns to degree lists */ + if (COL_IS_ALIVE (c)) + { + DEBUG4 (("place %d score %d minscore %d ncol %d\n", + c, Col [c].shared2.score, min_score, n_col)) ; + + /* === Add columns score to DList =============================== */ + + score = Col [c].shared2.score ; + + assert (min_score >= 0) ; + assert (min_score <= n_col) ; + assert (score >= 0) ; + assert (score <= n_col) ; + assert (head [score] >= EMPTY) ; + + /* now add this column to dList at proper score location */ + next_col = head [score] ; + Col [c].shared3.prev = EMPTY ; + Col [c].shared4.degree_next = next_col ; + + /* if there already was a column with the same score, set its */ + /* previous pointer to this new column */ + if (next_col != EMPTY) + { + Col [next_col].shared3.prev = c ; + } + head [score] = c ; + + /* see if this score is less than current min */ + min_score = MIN (min_score, score) ; + +#ifndef NDEBUG + debug_count++ ; +#endif + } + } + +#ifndef NDEBUG + DEBUG0 (("Live cols %d out of %d, non-princ: %d\n", + debug_count, n_col, n_col-debug_count)) ; + assert (debug_count == n_col2) ; + debug_deg_lists (n_row, n_col, Row, Col, head, min_score, n_col2, max_deg) ; +#endif + + /* === Return number of remaining columns, and max row degree =========== */ + + *p_n_col2 = n_col2 ; + *p_n_row2 = n_row2 ; + *p_max_deg = max_deg ; +} + + +/* ========================================================================== */ +/* === find_ordering ======================================================== */ +/* ========================================================================== */ + +/* + Order the principal columns of the supercolumn form of the matrix + (no supercolumns on input). Uses a minimum approximate column minimum + degree ordering method. Not user-callable. +*/ + +PRIVATE int find_ordering /* return the number of garbage collections */ +( + /* === Parameters ======================================================= */ + + int n_row, /* number of rows of A */ + int n_col, /* number of columns of A */ + int Alen, /* size of A, 2*nnz + elbow_room or larger */ + RowInfo Row [], /* of size n_row+1 */ + ColInfo Col [], /* of size n_col+1 */ + int A [], /* column form and row form of A */ + int head [], /* of size n_col+1 */ + int n_col2, /* Remaining columns to order */ + int max_deg, /* Maximum row degree */ + int pfree /* index of first free slot (2*nnz on entry) */ +) +{ + /* === Local variables ================================================== */ + + int k ; /* current pivot ordering step */ + int pivot_col ; /* current pivot column */ + int *cp ; /* a column pointer */ + int *rp ; /* a row pointer */ + int pivot_row ; /* current pivot row */ + int *new_cp ; /* modified column pointer */ + int *new_rp ; /* modified row pointer */ + int pivot_row_start ; /* pointer to start of pivot row */ + int pivot_row_degree ; /* # of columns in pivot row */ + int pivot_row_length ; /* # of supercolumns in pivot row */ + int pivot_col_score ; /* score of pivot column */ + int needed_memory ; /* free space needed for pivot row */ + int *cp_end ; /* pointer to the end of a column */ + int *rp_end ; /* pointer to the end of a row */ + int row ; /* a row index */ + int col ; /* a column index */ + int max_score ; /* maximum possible score */ + int cur_score ; /* score of current column */ + unsigned int hash ; /* hash value for supernode detection */ + int head_column ; /* head of hash bucket */ + int first_col ; /* first column in hash bucket */ + int tag_mark ; /* marker value for mark array */ + int row_mark ; /* Row [row].shared2.mark */ + int set_difference ; /* set difference size of row with pivot row */ + int min_score ; /* smallest column score */ + int col_thickness ; /* "thickness" (# of columns in a supercol) */ + int max_mark ; /* maximum value of tag_mark */ + int pivot_col_thickness ; /* number of columns represented by pivot col */ + int prev_col ; /* Used by Dlist operations. */ + int next_col ; /* Used by Dlist operations. */ + int ngarbage ; /* number of garbage collections performed */ +#ifndef NDEBUG + int debug_d ; /* debug loop counter */ + int debug_step = 0 ; /* debug loop counter */ +#endif + + /* === Initialization and clear mark ==================================== */ + + max_mark = INT_MAX - n_col ; /* INT_MAX defined in */ + tag_mark = clear_mark (n_row, Row) ; + min_score = 0 ; + ngarbage = 0 ; + DEBUG0 (("Ordering.. n_col2=%d\n", n_col2)) ; + + /* === Order the columns ================================================ */ + + for (k = 0 ; k < n_col2 ; /* 'k' is incremented below */) + { + +#ifndef NDEBUG + if (debug_step % 100 == 0) + { + DEBUG0 (("\n... Step k: %d out of n_col2: %d\n", k, n_col2)) ; + } + else + { + DEBUG1 (("\n----------Step k: %d out of n_col2: %d\n", k, n_col2)) ; + } + debug_step++ ; + debug_deg_lists (n_row, n_col, Row, Col, head, + min_score, n_col2-k, max_deg) ; + debug_matrix (n_row, n_col, Row, Col, A) ; +#endif + + /* === Select pivot column, and order it ============================ */ + + /* make sure degree list isn't empty */ + assert (min_score >= 0) ; + assert (min_score <= n_col) ; + assert (head [min_score] >= EMPTY) ; + +#ifndef NDEBUG + for (debug_d = 0 ; debug_d < min_score ; debug_d++) + { + assert (head [debug_d] == EMPTY) ; + } +#endif + + /* get pivot column from head of minimum degree list */ + while (head [min_score] == EMPTY && min_score < n_col) + { + min_score++ ; + } + pivot_col = head [min_score] ; + assert (pivot_col >= 0 && pivot_col <= n_col) ; + next_col = Col [pivot_col].shared4.degree_next ; + head [min_score] = next_col ; + if (next_col != EMPTY) + { + Col [next_col].shared3.prev = EMPTY ; + } + + assert (COL_IS_ALIVE (pivot_col)) ; + DEBUG3 (("Pivot col: %d\n", pivot_col)) ; + + /* remember score for defrag check */ + pivot_col_score = Col [pivot_col].shared2.score ; + + /* the pivot column is the kth column in the pivot order */ + Col [pivot_col].shared2.order = k ; + + /* increment order count by column thickness */ + pivot_col_thickness = Col [pivot_col].shared1.thickness ; + k += pivot_col_thickness ; + assert (pivot_col_thickness > 0) ; + + /* === Garbage_collection, if necessary ============================= */ + + needed_memory = MIN (pivot_col_score, n_col - k) ; + if (pfree + needed_memory >= Alen) + { + pfree = garbage_collection (n_row, n_col, Row, Col, A, &A [pfree]) ; + ngarbage++ ; + /* after garbage collection we will have enough */ + assert (pfree + needed_memory < Alen) ; + /* garbage collection has wiped out the Row[].shared2.mark array */ + tag_mark = clear_mark (n_row, Row) ; +#ifndef NDEBUG + debug_matrix (n_row, n_col, Row, Col, A) ; +#endif + } + + /* === Compute pivot row pattern ==================================== */ + + /* get starting location for this new merged row */ + pivot_row_start = pfree ; + + /* initialize new row counts to zero */ + pivot_row_degree = 0 ; + + /* tag pivot column as having been visited so it isn't included */ + /* in merged pivot row */ + Col [pivot_col].shared1.thickness = -pivot_col_thickness ; + + /* pivot row is the union of all rows in the pivot column pattern */ + cp = &A [Col [pivot_col].start] ; + cp_end = cp + Col [pivot_col].length ; + while (cp < cp_end) + { + /* get a row */ + row = *cp++ ; + DEBUG4 (("Pivot col pattern %d %d\n", ROW_IS_ALIVE (row), row)) ; + /* skip if row is dead */ + if (ROW_IS_DEAD (row)) + { + continue ; + } + rp = &A [Row [row].start] ; + rp_end = rp + Row [row].length ; + while (rp < rp_end) + { + /* get a column */ + col = *rp++ ; + /* add the column, if alive and untagged */ + col_thickness = Col [col].shared1.thickness ; + if (col_thickness > 0 && COL_IS_ALIVE (col)) + { + /* tag column in pivot row */ + Col [col].shared1.thickness = -col_thickness ; + assert (pfree < Alen) ; + /* place column in pivot row */ + A [pfree++] = col ; + pivot_row_degree += col_thickness ; + } + } + } + + /* clear tag on pivot column */ + Col [pivot_col].shared1.thickness = pivot_col_thickness ; + max_deg = MAX (max_deg, pivot_row_degree) ; + +#ifndef NDEBUG + DEBUG3 (("check2\n")) ; + debug_mark (n_row, Row, tag_mark, max_mark) ; +#endif + + /* === Kill all rows used to construct pivot row ==================== */ + + /* also kill pivot row, temporarily */ + cp = &A [Col [pivot_col].start] ; + cp_end = cp + Col [pivot_col].length ; + while (cp < cp_end) + { + /* may be killing an already dead row */ + row = *cp++ ; + DEBUG2 (("Kill row in pivot col: %d\n", row)) ; + KILL_ROW (row) ; + } + + /* === Select a row index to use as the new pivot row =============== */ + + pivot_row_length = pfree - pivot_row_start ; + if (pivot_row_length > 0) + { + /* pick the "pivot" row arbitrarily (first row in col) */ + pivot_row = A [Col [pivot_col].start] ; + DEBUG2 (("Pivotal row is %d\n", pivot_row)) ; + } + else + { + /* there is no pivot row, since it is of zero length */ + pivot_row = EMPTY ; + assert (pivot_row_length == 0) ; + } + assert (Col [pivot_col].length > 0 || pivot_row_length == 0) ; + + /* === Approximate degree computation =============================== */ + + /* Here begins the computation of the approximate degree. The column */ + /* score is the sum of the pivot row "length", plus the size of the */ + /* set differences of each row in the column minus the pattern of the */ + /* pivot row itself. The column ("thickness") itself is also */ + /* excluded from the column score (we thus use an approximate */ + /* external degree). */ + + /* The time taken by the following code (compute set differences, and */ + /* add them up) is proportional to the size of the data structure */ + /* being scanned - that is, the sum of the sizes of each column in */ + /* the pivot row. Thus, the amortized time to compute a column score */ + /* is proportional to the size of that column (where size, in this */ + /* context, is the column "length", or the number of row indices */ + /* in that column). The number of row indices in a column is */ + /* monotonically non-decreasing, from the length of the original */ + /* column on input to colamd. */ + + /* === Compute set differences ====================================== */ + + DEBUG1 (("** Computing set differences phase. **\n")) ; + + /* pivot row is currently dead - it will be revived later. */ + + DEBUG2 (("Pivot row: ")) ; + /* for each column in pivot row */ + rp = &A [pivot_row_start] ; + rp_end = rp + pivot_row_length ; + while (rp < rp_end) + { + col = *rp++ ; + assert (COL_IS_ALIVE (col) && col != pivot_col) ; + DEBUG2 (("Col: %d\n", col)) ; + + /* clear tags used to construct pivot row pattern */ + col_thickness = -Col [col].shared1.thickness ; + assert (col_thickness > 0) ; + Col [col].shared1.thickness = col_thickness ; + + /* === Remove column from degree list =========================== */ + + cur_score = Col [col].shared2.score ; + prev_col = Col [col].shared3.prev ; + next_col = Col [col].shared4.degree_next ; + assert (cur_score >= 0) ; + assert (cur_score <= n_col) ; + assert (cur_score >= EMPTY) ; + if (prev_col == EMPTY) + { + head [cur_score] = next_col ; + } + else + { + Col [prev_col].shared4.degree_next = next_col ; + } + if (next_col != EMPTY) + { + Col [next_col].shared3.prev = prev_col ; + } + + /* === Scan the column ========================================== */ + + cp = &A [Col [col].start] ; + cp_end = cp + Col [col].length ; + while (cp < cp_end) + { + /* get a row */ + row = *cp++ ; + row_mark = Row [row].shared2.mark ; + /* skip if dead */ + if (ROW_IS_MARKED_DEAD (row_mark)) + { + continue ; + } + assert (row != pivot_row) ; + set_difference = row_mark - tag_mark ; + /* check if the row has been seen yet */ + if (set_difference < 0) + { + assert (Row [row].shared1.degree <= max_deg) ; + set_difference = Row [row].shared1.degree ; + } + /* subtract column thickness from this row's set difference */ + set_difference -= col_thickness ; + assert (set_difference >= 0) ; + /* absorb this row if the set difference becomes zero */ + if (set_difference == 0) + { + DEBUG1 (("aggressive absorption. Row: %d\n", row)) ; + KILL_ROW (row) ; + } + else + { + /* save the new mark */ + Row [row].shared2.mark = set_difference + tag_mark ; + } + } + } + +#ifndef NDEBUG + debug_deg_lists (n_row, n_col, Row, Col, head, + min_score, n_col2-k-pivot_row_degree, max_deg) ; +#endif + + /* === Add up set differences for each column ======================= */ + + DEBUG1 (("** Adding set differences phase. **\n")) ; + + /* for each column in pivot row */ + rp = &A [pivot_row_start] ; + rp_end = rp + pivot_row_length ; + while (rp < rp_end) + { + /* get a column */ + col = *rp++ ; + assert (COL_IS_ALIVE (col) && col != pivot_col) ; + hash = 0 ; + cur_score = 0 ; + cp = &A [Col [col].start] ; + /* compact the column */ + new_cp = cp ; + cp_end = cp + Col [col].length ; + + DEBUG2 (("Adding set diffs for Col: %d.\n", col)) ; + + while (cp < cp_end) + { + /* get a row */ + row = *cp++ ; + assert(row >= 0 && row < n_row) ; + row_mark = Row [row].shared2.mark ; + /* skip if dead */ + if (ROW_IS_MARKED_DEAD (row_mark)) + { + continue ; + } + assert (row_mark > tag_mark) ; + /* compact the column */ + *new_cp++ = row ; + /* compute hash function */ + hash += row ; + /* add set difference */ + cur_score += row_mark - tag_mark ; + /* integer overflow... */ + cur_score = MIN (cur_score, n_col) ; + } + + /* recompute the column's length */ + Col [col].length = (int) (new_cp - &A [Col [col].start]) ; + + /* === Further mass elimination ================================= */ + + if (Col [col].length == 0) + { + DEBUG1 (("further mass elimination. Col: %d\n", col)) ; + /* nothing left but the pivot row in this column */ + KILL_PRINCIPAL_COL (col) ; + pivot_row_degree -= Col [col].shared1.thickness ; + assert (pivot_row_degree >= 0) ; + /* order it */ + Col [col].shared2.order = k ; + /* increment order count by column thickness */ + k += Col [col].shared1.thickness ; + } + else + { + /* === Prepare for supercolumn detection ==================== */ + + DEBUG2 (("Preparing supercol detection for Col: %d.\n", col)) ; + + /* save score so far */ + Col [col].shared2.score = cur_score ; + + /* add column to hash table, for supercolumn detection */ + hash %= n_col + 1 ; + + DEBUG2 ((" Hash = %d, n_col = %d.\n", hash, n_col)) ; + assert (hash <= n_col) ; + + head_column = head [hash] ; + if (head_column > EMPTY) + { + /* degree list "hash" is non-empty, use prev (shared3) of */ + /* first column in degree list as head of hash bucket */ + first_col = Col [head_column].shared3.headhash ; + Col [head_column].shared3.headhash = col ; + } + else + { + /* degree list "hash" is empty, use head as hash bucket */ + first_col = - (head_column + 2) ; + head [hash] = - (col + 2) ; + } + Col [col].shared4.hash_next = first_col ; + + /* save hash function in Col [col].shared3.hash */ + Col [col].shared3.hash = (int) hash ; + assert (COL_IS_ALIVE (col)) ; + } + } + + /* The approximate external column degree is now computed. */ + + /* === Supercolumn detection ======================================== */ + + DEBUG1 (("** Supercolumn detection phase. **\n")) ; + + detect_super_cols ( +#ifndef NDEBUG + n_col, Row, +#endif + Col, A, head, pivot_row_start, pivot_row_length) ; + + /* === Kill the pivotal column ====================================== */ + + KILL_PRINCIPAL_COL (pivot_col) ; + + /* === Clear mark =================================================== */ + + tag_mark += (max_deg + 1) ; + if (tag_mark >= max_mark) + { + DEBUG1 (("clearing tag_mark\n")) ; + tag_mark = clear_mark (n_row, Row) ; + } +#ifndef NDEBUG + DEBUG3 (("check3\n")) ; + debug_mark (n_row, Row, tag_mark, max_mark) ; +#endif + + /* === Finalize the new pivot row, and column scores ================ */ + + DEBUG1 (("** Finalize scores phase. **\n")) ; + + /* for each column in pivot row */ + rp = &A [pivot_row_start] ; + /* compact the pivot row */ + new_rp = rp ; + rp_end = rp + pivot_row_length ; + while (rp < rp_end) + { + col = *rp++ ; + /* skip dead columns */ + if (COL_IS_DEAD (col)) + { + continue ; + } + *new_rp++ = col ; + /* add new pivot row to column */ + A [Col [col].start + (Col [col].length++)] = pivot_row ; + + /* retrieve score so far and add on pivot row's degree. */ + /* (we wait until here for this in case the pivot */ + /* row's degree was reduced due to mass elimination). */ + cur_score = Col [col].shared2.score + pivot_row_degree ; + + /* calculate the max possible score as the number of */ + /* external columns minus the 'k' value minus the */ + /* columns thickness */ + max_score = n_col - k - Col [col].shared1.thickness ; + + /* make the score the external degree of the union-of-rows */ + cur_score -= Col [col].shared1.thickness ; + + /* make sure score is less or equal than the max score */ + cur_score = MIN (cur_score, max_score) ; + assert (cur_score >= 0) ; + + /* store updated score */ + Col [col].shared2.score = cur_score ; + + /* === Place column back in degree list ========================= */ + + assert (min_score >= 0) ; + assert (min_score <= n_col) ; + assert (cur_score >= 0) ; + assert (cur_score <= n_col) ; + assert (head [cur_score] >= EMPTY) ; + next_col = head [cur_score] ; + Col [col].shared4.degree_next = next_col ; + Col [col].shared3.prev = EMPTY ; + if (next_col != EMPTY) + { + Col [next_col].shared3.prev = col ; + } + head [cur_score] = col ; + + /* see if this score is less than current min */ + min_score = MIN (min_score, cur_score) ; + + } + +#ifndef NDEBUG + debug_deg_lists (n_row, n_col, Row, Col, head, + min_score, n_col2-k, max_deg) ; +#endif + + /* === Resurrect the new pivot row ================================== */ + + if (pivot_row_degree > 0) + { + /* update pivot row length to reflect any cols that were killed */ + /* during super-col detection and mass elimination */ + Row [pivot_row].start = pivot_row_start ; + Row [pivot_row].length = (int) (new_rp - &A[pivot_row_start]) ; + Row [pivot_row].shared1.degree = pivot_row_degree ; + Row [pivot_row].shared2.mark = 0 ; + /* pivot row is no longer dead */ + } + } + + /* === All principal columns have now been ordered ====================== */ + + return (ngarbage) ; +} + + +/* ========================================================================== */ +/* === order_children ======================================================= */ +/* ========================================================================== */ + +/* + The find_ordering routine has ordered all of the principal columns (the + representatives of the supercolumns). The non-principal columns have not + yet been ordered. This routine orders those columns by walking up the + parent tree (a column is a child of the column which absorbed it). The + final permutation vector is then placed in p [0 ... n_col-1], with p [0] + being the first column, and p [n_col-1] being the last. It doesn't look + like it at first glance, but be assured that this routine takes time linear + in the number of columns. Although not immediately obvious, the time + taken by this routine is O (n_col), that is, linear in the number of + columns. Not user-callable. +*/ + +PRIVATE void order_children +( + /* === Parameters ======================================================= */ + + int n_col, /* number of columns of A */ + ColInfo Col [], /* of size n_col+1 */ + int p [] /* p [0 ... n_col-1] is the column permutation*/ +) +{ + /* === Local variables ================================================== */ + + int i ; /* loop counter for all columns */ + int c ; /* column index */ + int parent ; /* index of column's parent */ + int order ; /* column's order */ + + /* === Order each non-principal column ================================== */ + + for (i = 0 ; i < n_col ; i++) + { + /* find an un-ordered non-principal column */ + assert (COL_IS_DEAD (i)) ; + if (!COL_IS_DEAD_PRINCIPAL (i) && Col [i].shared2.order == EMPTY) + { + parent = i ; + /* once found, find its principal parent */ + do + { + parent = Col [parent].shared1.parent ; + } while (!COL_IS_DEAD_PRINCIPAL (parent)) ; + + /* now, order all un-ordered non-principal columns along path */ + /* to this parent. collapse tree at the same time */ + c = i ; + /* get order of parent */ + order = Col [parent].shared2.order ; + + do + { + assert (Col [c].shared2.order == EMPTY) ; + + /* order this column */ + Col [c].shared2.order = order++ ; + /* collaps tree */ + Col [c].shared1.parent = parent ; + + /* get immediate parent of this column */ + c = Col [c].shared1.parent ; + + /* continue until we hit an ordered column. There are */ + /* guarranteed not to be anymore unordered columns */ + /* above an ordered column */ + } while (Col [c].shared2.order == EMPTY) ; + + /* re-order the super_col parent to largest order for this group */ + Col [parent].shared2.order = order ; + } + } + + /* === Generate the permutation ========================================= */ + + for (c = 0 ; c < n_col ; c++) + { + p [Col [c].shared2.order] = c ; + } +} + + +/* ========================================================================== */ +/* === detect_super_cols ==================================================== */ +/* ========================================================================== */ + +/* + Detects supercolumns by finding matches between columns in the hash buckets. + Check amongst columns in the set A [row_start ... row_start + row_length-1]. + The columns under consideration are currently *not* in the degree lists, + and have already been placed in the hash buckets. + + The hash bucket for columns whose hash function is equal to h is stored + as follows: + + if head [h] is >= 0, then head [h] contains a degree list, so: + + head [h] is the first column in degree bucket h. + Col [head [h]].headhash gives the first column in hash bucket h. + + otherwise, the degree list is empty, and: + + -(head [h] + 2) is the first column in hash bucket h. + + For a column c in a hash bucket, Col [c].shared3.prev is NOT a "previous + column" pointer. Col [c].shared3.hash is used instead as the hash number + for that column. The value of Col [c].shared4.hash_next is the next column + in the same hash bucket. + + Assuming no, or "few" hash collisions, the time taken by this routine is + linear in the sum of the sizes (lengths) of each column whose score has + just been computed in the approximate degree computation. + Not user-callable. +*/ + +PRIVATE void detect_super_cols +( + /* === Parameters ======================================================= */ + +#ifndef NDEBUG + /* these two parameters are only needed when debugging is enabled: */ + int n_col, /* number of columns of A */ + RowInfo Row [], /* of size n_row+1 */ +#endif + ColInfo Col [], /* of size n_col+1 */ + int A [], /* row indices of A */ + int head [], /* head of degree lists and hash buckets */ + int row_start, /* pointer to set of columns to check */ + int row_length /* number of columns to check */ +) +{ + /* === Local variables ================================================== */ + + int hash ; /* hash # for a column */ + int *rp ; /* pointer to a row */ + int c ; /* a column index */ + int super_c ; /* column index of the column to absorb into */ + int *cp1 ; /* column pointer for column super_c */ + int *cp2 ; /* column pointer for column c */ + int length ; /* length of column super_c */ + int prev_c ; /* column preceding c in hash bucket */ + int i ; /* loop counter */ + int *rp_end ; /* pointer to the end of the row */ + int col ; /* a column index in the row to check */ + int head_column ; /* first column in hash bucket or degree list */ + int first_col ; /* first column in hash bucket */ + + /* === Consider each column in the row ================================== */ + + rp = &A [row_start] ; + rp_end = rp + row_length ; + while (rp < rp_end) + { + col = *rp++ ; + if (COL_IS_DEAD (col)) + { + continue ; + } + + /* get hash number for this column */ + hash = Col [col].shared3.hash ; + assert (hash <= n_col) ; + + /* === Get the first column in this hash bucket ===================== */ + + head_column = head [hash] ; + if (head_column > EMPTY) + { + first_col = Col [head_column].shared3.headhash ; + } + else + { + first_col = - (head_column + 2) ; + } + + /* === Consider each column in the hash bucket ====================== */ + + for (super_c = first_col ; super_c != EMPTY ; + super_c = Col [super_c].shared4.hash_next) + { + assert (COL_IS_ALIVE (super_c)) ; + assert (Col [super_c].shared3.hash == hash) ; + length = Col [super_c].length ; + + /* prev_c is the column preceding column c in the hash bucket */ + prev_c = super_c ; + + /* === Compare super_c with all columns after it ================ */ + + for (c = Col [super_c].shared4.hash_next ; + c != EMPTY ; c = Col [c].shared4.hash_next) + { + assert (c != super_c) ; + assert (COL_IS_ALIVE (c)) ; + assert (Col [c].shared3.hash == hash) ; + + /* not identical if lengths or scores are different */ + if (Col [c].length != length || + Col [c].shared2.score != Col [super_c].shared2.score) + { + prev_c = c ; + continue ; + } + + /* compare the two columns */ + cp1 = &A [Col [super_c].start] ; + cp2 = &A [Col [c].start] ; + + for (i = 0 ; i < length ; i++) + { + /* the columns are "clean" (no dead rows) */ + assert (ROW_IS_ALIVE (*cp1)) ; + assert (ROW_IS_ALIVE (*cp2)) ; + /* row indices will same order for both supercols, */ + /* no gather scatter nessasary */ + if (*cp1++ != *cp2++) + { + break ; + } + } + + /* the two columns are different if the for-loop "broke" */ + if (i != length) + { + prev_c = c ; + continue ; + } + + /* === Got it! two columns are identical =================== */ + + assert (Col [c].shared2.score == Col [super_c].shared2.score) ; + + Col [super_c].shared1.thickness += Col [c].shared1.thickness ; + Col [c].shared1.parent = super_c ; + KILL_NON_PRINCIPAL_COL (c) ; + /* order c later, in order_children() */ + Col [c].shared2.order = EMPTY ; + /* remove c from hash bucket */ + Col [prev_c].shared4.hash_next = Col [c].shared4.hash_next ; + } + } + + /* === Empty this hash bucket ======================================= */ + + if (head_column > EMPTY) + { + /* corresponding degree list "hash" is not empty */ + Col [head_column].shared3.headhash = EMPTY ; + } + else + { + /* corresponding degree list "hash" is empty */ + head [hash] = EMPTY ; + } + } +} + + +/* ========================================================================== */ +/* === garbage_collection =================================================== */ +/* ========================================================================== */ + +/* + Defragments and compacts columns and rows in the workspace A. Used when + all avaliable memory has been used while performing row merging. Returns + the index of the first free position in A, after garbage collection. The + time taken by this routine is linear is the size of the array A, which is + itself linear in the number of nonzeros in the input matrix. + Not user-callable. +*/ + +PRIVATE int garbage_collection /* returns the new value of pfree */ +( + /* === Parameters ======================================================= */ + + int n_row, /* number of rows */ + int n_col, /* number of columns */ + RowInfo Row [], /* row info */ + ColInfo Col [], /* column info */ + int A [], /* A [0 ... Alen-1] holds the matrix */ + int *pfree /* &A [0] ... pfree is in use */ +) +{ + /* === Local variables ================================================== */ + + int *psrc ; /* source pointer */ + int *pdest ; /* destination pointer */ + int j ; /* counter */ + int r ; /* a row index */ + int c ; /* a column index */ + int length ; /* length of a row or column */ + +#ifndef NDEBUG + int debug_rows ; + DEBUG0 (("Defrag..\n")) ; + for (psrc = &A[0] ; psrc < pfree ; psrc++) assert (*psrc >= 0) ; + debug_rows = 0 ; +#endif + + /* === Defragment the columns =========================================== */ + + pdest = &A[0] ; + for (c = 0 ; c < n_col ; c++) + { + if (COL_IS_ALIVE (c)) + { + psrc = &A [Col [c].start] ; + + /* move and compact the column */ + assert (pdest <= psrc) ; + Col [c].start = (int) (pdest - &A [0]) ; + length = Col [c].length ; + for (j = 0 ; j < length ; j++) + { + r = *psrc++ ; + if (ROW_IS_ALIVE (r)) + { + *pdest++ = r ; + } + } + Col [c].length = (int) (pdest - &A [Col [c].start]) ; + } + } + + /* === Prepare to defragment the rows =================================== */ + + for (r = 0 ; r < n_row ; r++) + { + if (ROW_IS_ALIVE (r)) + { + if (Row [r].length == 0) + { + /* this row is of zero length. cannot compact it, so kill it */ + DEBUG0 (("Defrag row kill\n")) ; + KILL_ROW (r) ; + } + else + { + /* save first column index in Row [r].shared2.first_column */ + psrc = &A [Row [r].start] ; + Row [r].shared2.first_column = *psrc ; + assert (ROW_IS_ALIVE (r)) ; + /* flag the start of the row with the one's complement of row */ + *psrc = ONES_COMPLEMENT (r) ; +#ifndef NDEBUG + debug_rows++ ; +#endif + } + } + } + + /* === Defragment the rows ============================================== */ + + psrc = pdest ; + while (psrc < pfree) + { + /* find a negative number ... the start of a row */ + if (*psrc++ < 0) + { + psrc-- ; + /* get the row index */ + r = ONES_COMPLEMENT (*psrc) ; + assert (r >= 0 && r < n_row) ; + /* restore first column index */ + *psrc = Row [r].shared2.first_column ; + assert (ROW_IS_ALIVE (r)) ; + + /* move and compact the row */ + assert (pdest <= psrc) ; + Row [r].start = (int) (pdest - &A [0]) ; + length = Row [r].length ; + for (j = 0 ; j < length ; j++) + { + c = *psrc++ ; + if (COL_IS_ALIVE (c)) + { + *pdest++ = c ; + } + } + Row [r].length = (int) (pdest - &A [Row [r].start]) ; +#ifndef NDEBUG + debug_rows-- ; +#endif + } + } + /* ensure we found all the rows */ + assert (debug_rows == 0) ; + + /* === Return the new value of pfree ==================================== */ + + return ((int) (pdest - &A [0])) ; +} + + +/* ========================================================================== */ +/* === clear_mark =========================================================== */ +/* ========================================================================== */ + +/* + Clears the Row [].shared2.mark array, and returns the new tag_mark. + Return value is the new tag_mark. Not user-callable. +*/ + +PRIVATE int clear_mark /* return the new value for tag_mark */ +( + /* === Parameters ======================================================= */ + + int n_row, /* number of rows in A */ + RowInfo Row [] /* Row [0 ... n_row-1].shared2.mark is set to zero */ +) +{ + /* === Local variables ================================================== */ + + int r ; + + DEBUG0 (("Clear mark\n")) ; + for (r = 0 ; r < n_row ; r++) + { + if (ROW_IS_ALIVE (r)) + { + Row [r].shared2.mark = 0 ; + } + } + return (1) ; +} + + +/* ========================================================================== */ +/* === debugging routines =================================================== */ +/* ========================================================================== */ + +/* When debugging is disabled, the remainder of this file is ignored. */ + +#ifndef NDEBUG + + +/* ========================================================================== */ +/* === debug_structures ===================================================== */ +/* ========================================================================== */ + +/* + At this point, all empty rows and columns are dead. All live columns + are "clean" (containing no dead rows) and simplicial (no supercolumns + yet). Rows may contain dead columns, but all live rows contain at + least one live column. +*/ + +PRIVATE void debug_structures +( + /* === Parameters ======================================================= */ + + int n_row, + int n_col, + RowInfo Row [], + ColInfo Col [], + int A [], + int n_col2 +) +{ + /* === Local variables ================================================== */ + + int i ; + int c ; + int *cp ; + int *cp_end ; + int len ; + int score ; + int r ; + int *rp ; + int *rp_end ; + int deg ; + + /* === Check A, Row, and Col ============================================ */ + + for (c = 0 ; c < n_col ; c++) + { + if (COL_IS_ALIVE (c)) + { + len = Col [c].length ; + score = Col [c].shared2.score ; + DEBUG4 (("initial live col %5d %5d %5d\n", c, len, score)) ; + assert (len > 0) ; + assert (score >= 0) ; + assert (Col [c].shared1.thickness == 1) ; + cp = &A [Col [c].start] ; + cp_end = cp + len ; + while (cp < cp_end) + { + r = *cp++ ; + assert (ROW_IS_ALIVE (r)) ; + } + } + else + { + i = Col [c].shared2.order ; + assert (i >= n_col2 && i < n_col) ; + } + } + + for (r = 0 ; r < n_row ; r++) + { + if (ROW_IS_ALIVE (r)) + { + i = 0 ; + len = Row [r].length ; + deg = Row [r].shared1.degree ; + assert (len > 0) ; + assert (deg > 0) ; + rp = &A [Row [r].start] ; + rp_end = rp + len ; + while (rp < rp_end) + { + c = *rp++ ; + if (COL_IS_ALIVE (c)) + { + i++ ; + } + } + assert (i > 0) ; + } + } +} + + +/* ========================================================================== */ +/* === debug_deg_lists ====================================================== */ +/* ========================================================================== */ + +/* + Prints the contents of the degree lists. Counts the number of columns + in the degree list and compares it to the total it should have. Also + checks the row degrees. +*/ + +PRIVATE void debug_deg_lists +( + /* === Parameters ======================================================= */ + + int n_row, + int n_col, + RowInfo Row [], + ColInfo Col [], + int head [], + int min_score, + int should, + int max_deg +) +{ + /* === Local variables ================================================== */ + + int deg ; + int col ; + int have ; + int row ; + + /* === Check the degree lists =========================================== */ + + if (n_col > 10000 && debug_colamd <= 0) + { + return ; + } + have = 0 ; + DEBUG4 (("Degree lists: %d\n", min_score)) ; + for (deg = 0 ; deg <= n_col ; deg++) + { + col = head [deg] ; + if (col == EMPTY) + { + continue ; + } + DEBUG4 (("%d:", deg)) ; + while (col != EMPTY) + { + DEBUG4 ((" %d", col)) ; + have += Col [col].shared1.thickness ; + assert (COL_IS_ALIVE (col)) ; + col = Col [col].shared4.degree_next ; + } + DEBUG4 (("\n")) ; + } + DEBUG4 (("should %d have %d\n", should, have)) ; + assert (should == have) ; + + /* === Check the row degrees ============================================ */ + + if (n_row > 10000 && debug_colamd <= 0) + { + return ; + } + for (row = 0 ; row < n_row ; row++) + { + if (ROW_IS_ALIVE (row)) + { + assert (Row [row].shared1.degree <= max_deg) ; + } + } +} + + +/* ========================================================================== */ +/* === debug_mark =========================================================== */ +/* ========================================================================== */ + +/* + Ensures that the tag_mark is less that the maximum and also ensures that + each entry in the mark array is less than the tag mark. +*/ + +PRIVATE void debug_mark +( + /* === Parameters ======================================================= */ + + int n_row, + RowInfo Row [], + int tag_mark, + int max_mark +) +{ + /* === Local variables ================================================== */ + + int r ; + + /* === Check the Row marks ============================================== */ + + assert (tag_mark > 0 && tag_mark <= max_mark) ; + if (n_row > 10000 && debug_colamd <= 0) + { + return ; + } + for (r = 0 ; r < n_row ; r++) + { + assert (Row [r].shared2.mark < tag_mark) ; + } +} + + +/* ========================================================================== */ +/* === debug_matrix ========================================================= */ +/* ========================================================================== */ + +/* + Prints out the contents of the columns and the rows. +*/ + +PRIVATE void debug_matrix +( + /* === Parameters ======================================================= */ + + int n_row, + int n_col, + RowInfo Row [], + ColInfo Col [], + int A [] +) +{ + /* === Local variables ================================================== */ + + int r ; + int c ; + int *rp ; + int *rp_end ; + int *cp ; + int *cp_end ; + + /* === Dump the rows and columns of the matrix ========================== */ + + if (debug_colamd < 3) + { + return ; + } + DEBUG3 (("DUMP MATRIX:\n")) ; + for (r = 0 ; r < n_row ; r++) + { + DEBUG3 (("Row %d alive? %d\n", r, ROW_IS_ALIVE (r))) ; + if (ROW_IS_DEAD (r)) + { + continue ; + } + DEBUG3 (("start %d length %d degree %d\n", + Row [r].start, Row [r].length, Row [r].shared1.degree)) ; + rp = &A [Row [r].start] ; + rp_end = rp + Row [r].length ; + while (rp < rp_end) + { + c = *rp++ ; + DEBUG3 ((" %d col %d\n", COL_IS_ALIVE (c), c)) ; + } + } + + for (c = 0 ; c < n_col ; c++) + { + DEBUG3 (("Col %d alive? %d\n", c, COL_IS_ALIVE (c))) ; + if (COL_IS_DEAD (c)) + { + continue ; + } + DEBUG3 (("start %d length %d shared1 %d shared2 %d\n", + Col [c].start, Col [c].length, + Col [c].shared1.thickness, Col [c].shared2.score)) ; + cp = &A [Col [c].start] ; + cp_end = cp + Col [c].length ; + while (cp < cp_end) + { + r = *cp++ ; + DEBUG3 ((" %d row %d\n", ROW_IS_ALIVE (r), r)) ; + } + } +} + +#endif + diff --git a/intern/opennl/superlu/colamd.h b/intern/opennl/superlu/colamd.h new file mode 100644 index 00000000000..00783983b27 --- /dev/null +++ b/intern/opennl/superlu/colamd.h @@ -0,0 +1,67 @@ +/* ========================================================================== */ +/* === colamd prototypes and definitions ==================================== */ +/* ========================================================================== */ + +/* + This is the colamd include file, + + http://www.cise.ufl.edu/~davis/colamd/colamd.h + + for use in the colamd.c, colamdmex.c, and symamdmex.c files located at + + http://www.cise.ufl.edu/~davis/colamd/ + + See those files for a description of colamd and symamd, and for the + copyright notice, which also applies to this file. + + August 3, 1998. Version 1.0. +*/ + +/* ========================================================================== */ +/* === Definitions ========================================================== */ +/* ========================================================================== */ + +/* size of the knobs [ ] array. Only knobs [0..1] are currently used. */ +#define COLAMD_KNOBS 20 + +/* number of output statistics. Only A [0..2] are currently used. */ +#define COLAMD_STATS 20 + +/* knobs [0] and A [0]: dense row knob and output statistic. */ +#define COLAMD_DENSE_ROW 0 + +/* knobs [1] and A [1]: dense column knob and output statistic. */ +#define COLAMD_DENSE_COL 1 + +/* A [2]: memory defragmentation count output statistic */ +#define COLAMD_DEFRAG_COUNT 2 + +/* A [3]: whether or not the input columns were jumbled or had duplicates */ +#define COLAMD_JUMBLED_COLS 3 + +/* ========================================================================== */ +/* === Prototypes of user-callable routines ================================= */ +/* ========================================================================== */ + +int colamd_recommended /* returns recommended value of Alen */ +( + int nnz, /* nonzeros in A */ + int n_row, /* number of rows in A */ + int n_col /* number of columns in A */ +) ; + +void colamd_set_defaults /* sets default parameters */ +( /* knobs argument is modified on output */ + double knobs [COLAMD_KNOBS] /* parameter settings for colamd */ +) ; + +int colamd /* returns TRUE if successful, FALSE otherwise*/ +( /* A and p arguments are modified on output */ + int n_row, /* number of rows in A */ + int n_col, /* number of columns in A */ + int Alen, /* size of the array A */ + int A [], /* row indices of A, of size Alen */ + int p [], /* column pointers of A, of size n_col+1 */ + double knobs [COLAMD_KNOBS] /* parameter settings for colamd */ +) ; + diff --git a/intern/opennl/superlu/get_perm_c.c b/intern/opennl/superlu/get_perm_c.c new file mode 100644 index 00000000000..9cdf5a876bf --- /dev/null +++ b/intern/opennl/superlu/get_perm_c.c @@ -0,0 +1,453 @@ +/* + * -- SuperLU routine (version 2.0) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * November 15, 1997 + * + */ + +#include "ssp_defs.h" +#include "colamd.h" + +extern int genmmd_(int *, int *, int *, int *, int *, int *, int *, + int *, int *, int *, int *, int *); + +void +get_colamd( + const int m, /* number of rows in matrix A. */ + const int n, /* number of columns in matrix A. */ + const int nnz,/* number of nonzeros in matrix A. */ + int *colptr, /* column pointer of size n+1 for matrix A. */ + int *rowind, /* row indices of size nz for matrix A. */ + int *perm_c /* out - the column permutation vector. */ + ) +{ + int Alen, *A, i, info, *p; + double *knobs; + + Alen = colamd_recommended(nnz, m, n); + + if ( !(knobs = (double *) SUPERLU_MALLOC(COLAMD_KNOBS * sizeof(double))) ) + ABORT("Malloc fails for knobs"); + colamd_set_defaults(knobs); + + if (!(A = (int *) SUPERLU_MALLOC(Alen * sizeof(int))) ) + ABORT("Malloc fails for A[]"); + if (!(p = (int *) SUPERLU_MALLOC((n+1) * sizeof(int))) ) + ABORT("Malloc fails for p[]"); + for (i = 0; i <= n; ++i) p[i] = colptr[i]; + for (i = 0; i < nnz; ++i) A[i] = rowind[i]; + info = colamd(m, n, Alen, A, p, knobs); + if ( info == FALSE ) ABORT("COLAMD failed"); + + for (i = 0; i < n; ++i) perm_c[p[i]] = i; + + SUPERLU_FREE(knobs); + SUPERLU_FREE(A); + SUPERLU_FREE(p); +} + +void +getata( + const int m, /* number of rows in matrix A. */ + const int n, /* number of columns in matrix A. */ + const int nz, /* number of nonzeros in matrix A */ + int *colptr, /* column pointer of size n+1 for matrix A. */ + int *rowind, /* row indices of size nz for matrix A. */ + int *atanz, /* out - on exit, returns the actual number of + nonzeros in matrix A'*A. */ + int **ata_colptr, /* out - size n+1 */ + int **ata_rowind /* out - size *atanz */ + ) +/* + * Purpose + * ======= + * + * Form the structure of A'*A. A is an m-by-n matrix in column oriented + * format represented by (colptr, rowind). The output A'*A is in column + * oriented format (symmetrically, also row oriented), represented by + * (ata_colptr, ata_rowind). + * + * This routine is modified from GETATA routine by Tim Davis. + * The complexity of this algorithm is: SUM_{i=1,m} r(i)^2, + * i.e., the sum of the square of the row counts. + * + * Questions + * ========= + * o Do I need to withhold the *dense* rows? + * o How do I know the number of nonzeros in A'*A? + * + */ +{ + register int i, j, k, col, num_nz, ti, trow; + int *marker, *b_colptr, *b_rowind; + int *t_colptr, *t_rowind; /* a column oriented form of T = A' */ + + if ( !(marker = (int*) SUPERLU_MALLOC((SUPERLU_MAX(m,n)+1)*sizeof(int))) ) + ABORT("SUPERLU_MALLOC fails for marker[]"); + if ( !(t_colptr = (int*) SUPERLU_MALLOC((m+1) * sizeof(int))) ) + ABORT("SUPERLU_MALLOC t_colptr[]"); + if ( !(t_rowind = (int*) SUPERLU_MALLOC(nz * sizeof(int))) ) + ABORT("SUPERLU_MALLOC fails for t_rowind[]"); + + + /* Get counts of each column of T, and set up column pointers */ + for (i = 0; i < m; ++i) marker[i] = 0; + for (j = 0; j < n; ++j) { + for (i = colptr[j]; i < colptr[j+1]; ++i) + ++marker[rowind[i]]; + } + t_colptr[0] = 0; + for (i = 0; i < m; ++i) { + t_colptr[i+1] = t_colptr[i] + marker[i]; + marker[i] = t_colptr[i]; + } + + /* Transpose the matrix from A to T */ + for (j = 0; j < n; ++j) + for (i = colptr[j]; i < colptr[j+1]; ++i) { + col = rowind[i]; + t_rowind[marker[col]] = j; + ++marker[col]; + } + + + /* ---------------------------------------------------------------- + compute B = T * A, where column j of B is: + + Struct (B_*j) = UNION ( Struct (T_*k) ) + A_kj != 0 + + do not include the diagonal entry + + ( Partition A as: A = (A_*1, ..., A_*n) + Then B = T * A = (T * A_*1, ..., T * A_*n), where + T * A_*j = (T_*1, ..., T_*m) * A_*j. ) + ---------------------------------------------------------------- */ + + /* Zero the diagonal flag */ + for (i = 0; i < n; ++i) marker[i] = -1; + + /* First pass determines number of nonzeros in B */ + num_nz = 0; + for (j = 0; j < n; ++j) { + /* Flag the diagonal so it's not included in the B matrix */ + marker[j] = j; + + for (i = colptr[j]; i < colptr[j+1]; ++i) { + /* A_kj is nonzero, add pattern of column T_*k to B_*j */ + k = rowind[i]; + for (ti = t_colptr[k]; ti < t_colptr[k+1]; ++ti) { + trow = t_rowind[ti]; + if ( marker[trow] != j ) { + marker[trow] = j; + num_nz++; + } + } + } + } + *atanz = num_nz; + + /* Allocate storage for A'*A */ + if ( !(*ata_colptr = (int*) SUPERLU_MALLOC( (n+1) * sizeof(int)) ) ) + ABORT("SUPERLU_MALLOC fails for ata_colptr[]"); + if ( *atanz ) { + if ( !(*ata_rowind = (int*) SUPERLU_MALLOC( *atanz * sizeof(int)) ) ) + ABORT("SUPERLU_MALLOC fails for ata_rowind[]"); + } + b_colptr = *ata_colptr; /* aliasing */ + b_rowind = *ata_rowind; + + /* Zero the diagonal flag */ + for (i = 0; i < n; ++i) marker[i] = -1; + + /* Compute each column of B, one at a time */ + num_nz = 0; + for (j = 0; j < n; ++j) { + b_colptr[j] = num_nz; + + /* Flag the diagonal so it's not included in the B matrix */ + marker[j] = j; + + for (i = colptr[j]; i < colptr[j+1]; ++i) { + /* A_kj is nonzero, add pattern of column T_*k to B_*j */ + k = rowind[i]; + for (ti = t_colptr[k]; ti < t_colptr[k+1]; ++ti) { + trow = t_rowind[ti]; + if ( marker[trow] != j ) { + marker[trow] = j; + b_rowind[num_nz++] = trow; + } + } + } + } + b_colptr[n] = num_nz; + + SUPERLU_FREE(marker); + SUPERLU_FREE(t_colptr); + SUPERLU_FREE(t_rowind); +} + + +void +at_plus_a( + const int n, /* number of columns in matrix A. */ + const int nz, /* number of nonzeros in matrix A */ + int *colptr, /* column pointer of size n+1 for matrix A. */ + int *rowind, /* row indices of size nz for matrix A. */ + int *bnz, /* out - on exit, returns the actual number of + nonzeros in matrix A'*A. */ + int **b_colptr, /* out - size n+1 */ + int **b_rowind /* out - size *bnz */ + ) +{ +/* + * Purpose + * ======= + * + * Form the structure of A'+A. A is an n-by-n matrix in column oriented + * format represented by (colptr, rowind). The output A'+A is in column + * oriented format (symmetrically, also row oriented), represented by + * (b_colptr, b_rowind). + * + */ + register int i, j, k, col, num_nz; + int *t_colptr, *t_rowind; /* a column oriented form of T = A' */ + int *marker; + + if ( !(marker = (int*) SUPERLU_MALLOC( n * sizeof(int)) ) ) + ABORT("SUPERLU_MALLOC fails for marker[]"); + if ( !(t_colptr = (int*) SUPERLU_MALLOC( (n+1) * sizeof(int)) ) ) + ABORT("SUPERLU_MALLOC fails for t_colptr[]"); + if ( !(t_rowind = (int*) SUPERLU_MALLOC( nz * sizeof(int)) ) ) + ABORT("SUPERLU_MALLOC fails t_rowind[]"); + + + /* Get counts of each column of T, and set up column pointers */ + for (i = 0; i < n; ++i) marker[i] = 0; + for (j = 0; j < n; ++j) { + for (i = colptr[j]; i < colptr[j+1]; ++i) + ++marker[rowind[i]]; + } + t_colptr[0] = 0; + for (i = 0; i < n; ++i) { + t_colptr[i+1] = t_colptr[i] + marker[i]; + marker[i] = t_colptr[i]; + } + + /* Transpose the matrix from A to T */ + for (j = 0; j < n; ++j) + for (i = colptr[j]; i < colptr[j+1]; ++i) { + col = rowind[i]; + t_rowind[marker[col]] = j; + ++marker[col]; + } + + + /* ---------------------------------------------------------------- + compute B = A + T, where column j of B is: + + Struct (B_*j) = Struct (A_*k) UNION Struct (T_*k) + + do not include the diagonal entry + ---------------------------------------------------------------- */ + + /* Zero the diagonal flag */ + for (i = 0; i < n; ++i) marker[i] = -1; + + /* First pass determines number of nonzeros in B */ + num_nz = 0; + for (j = 0; j < n; ++j) { + /* Flag the diagonal so it's not included in the B matrix */ + marker[j] = j; + + /* Add pattern of column A_*k to B_*j */ + for (i = colptr[j]; i < colptr[j+1]; ++i) { + k = rowind[i]; + if ( marker[k] != j ) { + marker[k] = j; + ++num_nz; + } + } + + /* Add pattern of column T_*k to B_*j */ + for (i = t_colptr[j]; i < t_colptr[j+1]; ++i) { + k = t_rowind[i]; + if ( marker[k] != j ) { + marker[k] = j; + ++num_nz; + } + } + } + *bnz = num_nz; + + /* Allocate storage for A+A' */ + if ( !(*b_colptr = (int*) SUPERLU_MALLOC( (n+1) * sizeof(int)) ) ) + ABORT("SUPERLU_MALLOC fails for b_colptr[]"); + if ( *bnz) { + if ( !(*b_rowind = (int*) SUPERLU_MALLOC( *bnz * sizeof(int)) ) ) + ABORT("SUPERLU_MALLOC fails for b_rowind[]"); + } + + /* Zero the diagonal flag */ + for (i = 0; i < n; ++i) marker[i] = -1; + + /* Compute each column of B, one at a time */ + num_nz = 0; + for (j = 0; j < n; ++j) { + (*b_colptr)[j] = num_nz; + + /* Flag the diagonal so it's not included in the B matrix */ + marker[j] = j; + + /* Add pattern of column A_*k to B_*j */ + for (i = colptr[j]; i < colptr[j+1]; ++i) { + k = rowind[i]; + if ( marker[k] != j ) { + marker[k] = j; + (*b_rowind)[num_nz++] = k; + } + } + + /* Add pattern of column T_*k to B_*j */ + for (i = t_colptr[j]; i < t_colptr[j+1]; ++i) { + k = t_rowind[i]; + if ( marker[k] != j ) { + marker[k] = j; + (*b_rowind)[num_nz++] = k; + } + } + } + (*b_colptr)[n] = num_nz; + + SUPERLU_FREE(marker); + SUPERLU_FREE(t_colptr); + SUPERLU_FREE(t_rowind); +} + +void +get_perm_c(int ispec, SuperMatrix *A, int *perm_c) +/* + * Purpose + * ======= + * + * GET_PERM_C obtains a permutation matrix Pc, by applying the multiple + * minimum degree ordering code by Joseph Liu to matrix A'*A or A+A'. + * or using approximate minimum degree column ordering by Davis et. al. + * The LU factorization of A*Pc tends to have less fill than the LU + * factorization of A. + * + * Arguments + * ========= + * + * ispec (input) int + * Specifies the type of column ordering to reduce fill: + * = 1: minimum degree on the structure of A^T * A + * = 2: minimum degree on the structure of A^T + A + * = 3: approximate minimum degree for unsymmetric matrices + * If ispec == 0, the natural ordering (i.e., Pc = I) is returned. + * + * A (input) SuperMatrix* + * Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number + * of the linear equations is A->nrow. Currently, the type of A + * can be: Stype = NC; Dtype = _D; Mtype = GE. In the future, + * more general A can be handled. + * + * perm_c (output) int* + * Column permutation vector of size A->ncol, which defines the + * permutation matrix Pc; perm_c[i] = j means column i of A is + * in position j in A*Pc. + * + */ +{ + NCformat *Astore = A->Store; + int m, n, bnz, *b_colptr, i; + int delta, maxint, nofsub, *invp; + int *b_rowind, *dhead, *qsize, *llist, *marker; + double t, SuperLU_timer_(); + + m = A->nrow; + n = A->ncol; + + t = SuperLU_timer_(); + switch ( ispec ) { + case 0: /* Natural ordering */ + for (i = 0; i < n; ++i) perm_c[i] = i; +#if ( PRNTlevel>=1 ) + printf("Use natural column ordering.\n"); +#endif + return; + case 1: /* Minimum degree ordering on A'*A */ + getata(m, n, Astore->nnz, Astore->colptr, Astore->rowind, + &bnz, &b_colptr, &b_rowind); +#if ( PRNTlevel>=1 ) + printf("Use minimum degree ordering on A'*A.\n"); +#endif + t = SuperLU_timer_() - t; + /*printf("Form A'*A time = %8.3f\n", t);*/ + break; + case 2: /* Minimum degree ordering on A'+A */ + if ( m != n ) ABORT("Matrix is not square"); + at_plus_a(n, Astore->nnz, Astore->colptr, Astore->rowind, + &bnz, &b_colptr, &b_rowind); +#if ( PRNTlevel>=1 ) + printf("Use minimum degree ordering on A'+A.\n"); +#endif + t = SuperLU_timer_() - t; + /*printf("Form A'+A time = %8.3f\n", t);*/ + break; + case 3: /* Approximate minimum degree column ordering. */ + get_colamd(m, n, Astore->nnz, Astore->colptr, Astore->rowind, + perm_c); +#if ( PRNTlevel>=1 ) + printf(".. Use approximate minimum degree column ordering.\n"); +#endif + return; + default: + ABORT("Invalid ISPEC"); + } + + if ( bnz != 0 ) { + t = SuperLU_timer_(); + + /* Initialize and allocate storage for GENMMD. */ + delta = 1; /* DELTA is a parameter to allow the choice of nodes + whose degree <= min-degree + DELTA. */ + maxint = 2147483647; /* 2**31 - 1 */ + invp = (int *) SUPERLU_MALLOC((n+delta)*sizeof(int)); + if ( !invp ) ABORT("SUPERLU_MALLOC fails for invp."); + dhead = (int *) SUPERLU_MALLOC((n+delta)*sizeof(int)); + if ( !dhead ) ABORT("SUPERLU_MALLOC fails for dhead."); + qsize = (int *) SUPERLU_MALLOC((n+delta)*sizeof(int)); + if ( !qsize ) ABORT("SUPERLU_MALLOC fails for qsize."); + llist = (int *) SUPERLU_MALLOC(n*sizeof(int)); + if ( !llist ) ABORT("SUPERLU_MALLOC fails for llist."); + marker = (int *) SUPERLU_MALLOC(n*sizeof(int)); + if ( !marker ) ABORT("SUPERLU_MALLOC fails for marker."); + + /* Transform adjacency list into 1-based indexing required by GENMMD.*/ + for (i = 0; i <= n; ++i) ++b_colptr[i]; + for (i = 0; i < bnz; ++i) ++b_rowind[i]; + + genmmd_(&n, b_colptr, b_rowind, perm_c, invp, &delta, dhead, + qsize, llist, marker, &maxint, &nofsub); + + /* Transform perm_c into 0-based indexing. */ + for (i = 0; i < n; ++i) --perm_c[i]; + + SUPERLU_FREE(b_colptr); + SUPERLU_FREE(b_rowind); + SUPERLU_FREE(invp); + SUPERLU_FREE(dhead); + SUPERLU_FREE(qsize); + SUPERLU_FREE(llist); + SUPERLU_FREE(marker); + + t = SuperLU_timer_() - t; + /* printf("call GENMMD time = %8.3f\n", t);*/ + + } else { /* Empty adjacency structure */ + for (i = 0; i < n; ++i) perm_c[i] = i; + } + +} diff --git a/intern/opennl/superlu/heap_relax_snode.c b/intern/opennl/superlu/heap_relax_snode.c new file mode 100644 index 00000000000..86971f59571 --- /dev/null +++ b/intern/opennl/superlu/heap_relax_snode.c @@ -0,0 +1,116 @@ +/* + * -- SuperLU routine (version 3.0) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * October 15, 2003 + * + */ +/* + Copyright (c) 1994 by Xerox Corporation. All rights reserved. + + THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY + EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. + + Permission is hereby granted to use or copy this program for any + purpose, provided the above notices are retained on all copies. + Permission to modify the code and to distribute modified code is + granted, provided the above notices are retained, and a notice that + the code was modified is included with the above copyright notice. +*/ + +#include "ssp_defs.h" + +void +heap_relax_snode ( + const int n, + int *et, /* column elimination tree */ + const int relax_columns, /* max no of columns allowed in a + relaxed snode */ + int *descendants, /* no of descendants of each node + in the etree */ + int *relax_end /* last column in a supernode */ + ) +{ +/* + * Purpose + * ======= + * relax_snode() - Identify the initial relaxed supernodes, assuming that + * the matrix has been reordered according to the postorder of the etree. + * + */ + register int i, j, k, l, parent; + register int snode_start; /* beginning of a snode */ + int *et_save, *post, *inv_post, *iwork; + int nsuper_et = 0, nsuper_et_post = 0; + + /* The etree may not be postordered, but is heap ordered. */ + + iwork = (int*) intMalloc(3*n+2); + if ( !iwork ) ABORT("SUPERLU_MALLOC fails for iwork[]"); + inv_post = iwork + n+1; + et_save = inv_post + n+1; + + /* Post order etree */ + post = (int *) TreePostorder(n, et); + for (i = 0; i < n+1; ++i) inv_post[post[i]] = i; + + /* Renumber etree in postorder */ + for (i = 0; i < n; ++i) { + iwork[post[i]] = post[et[i]]; + et_save[i] = et[i]; /* Save the original etree */ + } + for (i = 0; i < n; ++i) et[i] = iwork[i]; + + /* Compute the number of descendants of each node in the etree */ + ifill (relax_end, n, EMPTY); + for (j = 0; j < n; j++) descendants[j] = 0; + for (j = 0; j < n; j++) { + parent = et[j]; + if ( parent != n ) /* not the dummy root */ + descendants[parent] += descendants[j] + 1; + } + + /* Identify the relaxed supernodes by postorder traversal of the etree. */ + for (j = 0; j < n; ) { + parent = et[j]; + snode_start = j; + while ( parent != n && descendants[parent] < relax_columns ) { + j = parent; + parent = et[j]; + } + /* Found a supernode in postordered etree; j is the last column. */ + ++nsuper_et_post; + k = n; + for (i = snode_start; i <= j; ++i) + k = SUPERLU_MIN(k, inv_post[i]); + l = inv_post[j]; + if ( (l - k) == (j - snode_start) ) { + /* It's also a supernode in the original etree */ + relax_end[k] = l; /* Last column is recorded */ + ++nsuper_et; + } else { + for (i = snode_start; i <= j; ++i) { + l = inv_post[i]; + if ( descendants[i] == 0 ) relax_end[l] = l; + } + } + j++; + /* Search for a new leaf */ + while ( descendants[j] != 0 && j < n ) j++; + } + +#if ( PRNTlevel>=1 ) + printf(".. heap_snode_relax:\n" + "\tNo of relaxed snodes in postordered etree:\t%d\n" + "\tNo of relaxed snodes in original etree:\t%d\n", + nsuper_et_post, nsuper_et); +#endif + + /* Recover the original etree */ + for (i = 0; i < n; ++i) et[i] = et_save[i]; + + SUPERLU_FREE(post); + SUPERLU_FREE(iwork); +} + + diff --git a/intern/opennl/superlu/lsame.c b/intern/opennl/superlu/lsame.c new file mode 100644 index 00000000000..29f27d38fa9 --- /dev/null +++ b/intern/opennl/superlu/lsame.c @@ -0,0 +1,70 @@ +int lsame_(char *ca, char *cb) +{ +/* -- LAPACK auxiliary routine (version 2.0) -- + Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., + Courant Institute, Argonne National Lab, and Rice University + September 30, 1994 + + Purpose + ======= + + LSAME returns .TRUE. if CA is the same letter as CB regardless of case. + + Arguments + ========= + + CA (input) CHARACTER*1 + CB (input) CHARACTER*1 + CA and CB specify the single characters to be compared. + + ===================================================================== +*/ + + /* System generated locals */ + int ret_val; + + /* Local variables */ + int inta, intb, zcode; + + ret_val = *(unsigned char *)ca == *(unsigned char *)cb; + if (ret_val) { + return ret_val; + } + + /* Now test for equivalence if both characters are alphabetic. */ + + zcode = 'Z'; + + /* Use 'Z' rather than 'A' so that ASCII can be detected on Prime + machines, on which ICHAR returns a value with bit 8 set. + ICHAR('A') on Prime machines returns 193 which is the same as + ICHAR('A') on an EBCDIC machine. */ + + inta = *(unsigned char *)ca; + intb = *(unsigned char *)cb; + + if (zcode == 90 || zcode == 122) { + /* ASCII is assumed - ZCODE is the ASCII code of either lower or + upper case 'Z'. */ + if (inta >= 97 && inta <= 122) inta += -32; + if (intb >= 97 && intb <= 122) intb += -32; + + } else if (zcode == 233 || zcode == 169) { + /* EBCDIC is assumed - ZCODE is the EBCDIC code of either lower or + upper case 'Z'. */ + if ((inta >= 129 && inta <= 137) || (inta >= 145 && inta <= 153) || (inta + >= 162 && inta <= 169)) + inta += 64; + if ((intb >= 129 && intb <= 137) || (intb >= 145 && intb <= 153) || (intb + >= 162 && intb <= 169)) + intb += 64; + } else if (zcode == 218 || zcode == 250) { + /* ASCII is assumed, on Prime machines - ZCODE is the ASCII code + plus 128 of either lower or upper case 'Z'. */ + if (inta >= 225 && inta <= 250) inta += -32; + if (intb >= 225 && intb <= 250) intb += -32; + } + ret_val = inta == intb; + return ret_val; + +} /* lsame_ */ diff --git a/intern/opennl/superlu/memory.c b/intern/opennl/superlu/memory.c new file mode 100644 index 00000000000..54d863ea9e9 --- /dev/null +++ b/intern/opennl/superlu/memory.c @@ -0,0 +1,207 @@ +/* + * -- SuperLU routine (version 2.0) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * November 15, 1997 + * + */ +/** Precision-independent memory-related routines. + (Shared by [sdcz]memory.c) **/ + +#include "ssp_defs.h" + + +#if ( DEBUGlevel>=1 ) /* Debug malloc/free. */ +int superlu_malloc_total = 0; + +#define PAD_FACTOR 2 +#define DWORD (sizeof(double)) /* Be sure it's no smaller than double. */ + +void *superlu_malloc(size_t size) +{ + char *buf; + + buf = (char *) malloc(size + DWORD); + if ( !buf ) { + printf("superlu_malloc fails: malloc_total %.0f MB, size %d\n", + superlu_malloc_total*1e-6, size); + ABORT("superlu_malloc: out of memory"); + } + + ((int_t *) buf)[0] = size; +#if 0 + superlu_malloc_total += size + DWORD; +#else + superlu_malloc_total += size; +#endif + return (void *) (buf + DWORD); +} + +void superlu_free(void *addr) +{ + char *p = ((char *) addr) - DWORD; + + if ( !addr ) + ABORT("superlu_free: tried to free NULL pointer"); + + if ( !p ) + ABORT("superlu_free: tried to free NULL+DWORD pointer"); + + { + int_t n = ((int_t *) p)[0]; + + if ( !n ) + ABORT("superlu_free: tried to free a freed pointer"); + *((int_t *) p) = 0; /* Set to zero to detect duplicate free's. */ +#if 0 + superlu_malloc_total -= (n + DWORD); +#else + superlu_malloc_total -= n; +#endif + + if ( superlu_malloc_total < 0 ) + ABORT("superlu_malloc_total went negative!"); + + /*free (addr);*/ + free (p); + } + +} + +#else /* production mode */ + +void *superlu_malloc(size_t size) +{ + void *buf; + buf = (void *) malloc(size); + return (buf); +} + +void superlu_free(void *addr) +{ + free (addr); +} + +#endif + + +/* + * Set up pointers for integer working arrays. + */ +void +SetIWork(int m, int n, int panel_size, int *iworkptr, int **segrep, + int **parent, int **xplore, int **repfnz, int **panel_lsub, + int **xprune, int **marker) +{ + *segrep = iworkptr; + *parent = iworkptr + m; + *xplore = *parent + m; + *repfnz = *xplore + m; + *panel_lsub = *repfnz + panel_size * m; + *xprune = *panel_lsub + panel_size * m; + *marker = *xprune + n; + ifill (*repfnz, m * panel_size, EMPTY); + ifill (*panel_lsub, m * panel_size, EMPTY); +} + + +void +copy_mem_int(int howmany, void *old, void *new) +{ + register int i; + int *iold = old; + int *inew = new; + for (i = 0; i < howmany; i++) inew[i] = iold[i]; +} + + +void +user_bcopy(char *src, char *dest, int bytes) +{ + char *s_ptr, *d_ptr; + + s_ptr = src + bytes - 1; + d_ptr = dest + bytes - 1; + for (; d_ptr >= dest; --s_ptr, --d_ptr ) *d_ptr = *s_ptr; +} + + + +int *intMalloc(int n) +{ + int *buf; + buf = (int *) SUPERLU_MALLOC(n * sizeof(int)); + if ( !buf ) { + ABORT("SUPERLU_MALLOC fails for buf in intMalloc()"); + } + return (buf); +} + +int *intCalloc(int n) +{ + int *buf; + register int i; + buf = (int *) SUPERLU_MALLOC(n * sizeof(int)); + if ( !buf ) { + ABORT("SUPERLU_MALLOC fails for buf in intCalloc()"); + } + for (i = 0; i < n; ++i) buf[i] = 0; + return (buf); +} + + + +#if 0 +check_expanders() +{ + int p; + printf("Check expanders:\n"); + for (p = 0; p < NO_MEMTYPE; p++) { + printf("type %d, size %d, mem %d\n", + p, expanders[p].size, (int)expanders[p].mem); + } + + return 0; +} + + +StackInfo() +{ + printf("Stack: size %d, used %d, top1 %d, top2 %d\n", + stack.size, stack.used, stack.top1, stack.top2); + return 0; +} + + + +PrintStack(char *msg, GlobalLU_t *Glu) +{ + int i; + int *xlsub, *lsub, *xusub, *usub; + + xlsub = Glu->xlsub; + lsub = Glu->lsub; + xusub = Glu->xusub; + usub = Glu->usub; + + printf("%s\n", msg); + +/* printf("\nUCOL: "); + for (i = 0; i < xusub[ndim]; ++i) + printf("%f ", ucol[i]); + + printf("\nLSUB: "); + for (i = 0; i < xlsub[ndim]; ++i) + printf("%d ", lsub[i]); + + printf("\nUSUB: "); + for (i = 0; i < xusub[ndim]; ++i) + printf("%d ", usub[i]); + + printf("\n");*/ + return 0; +} +#endif + + + diff --git a/intern/opennl/superlu/mmd.c b/intern/opennl/superlu/mmd.c new file mode 100644 index 00000000000..05f26ce0995 --- /dev/null +++ b/intern/opennl/superlu/mmd.c @@ -0,0 +1,1012 @@ + +typedef int shortint; + +/* *************************************************************** */ +/* *************************************************************** */ +/* **** GENMMD ..... MULTIPLE MINIMUM EXTERNAL DEGREE **** */ +/* *************************************************************** */ +/* *************************************************************** */ + +/* AUTHOR - JOSEPH W.H. LIU */ +/* DEPT OF COMPUTER SCIENCE, YORK UNIVERSITY. */ + +/* PURPOSE - THIS ROUTINE IMPLEMENTS THE MINIMUM DEGREE */ +/* ALGORITHM. IT MAKES USE OF THE IMPLICIT REPRESENTATION */ +/* OF ELIMINATION GRAPHS BY QUOTIENT GRAPHS, AND THE */ +/* NOTION OF INDISTINGUISHABLE NODES. IT ALSO IMPLEMENTS */ +/* THE MODIFICATIONS BY MULTIPLE ELIMINATION AND MINIMUM */ +/* EXTERNAL DEGREE. */ +/* --------------------------------------------- */ +/* CAUTION - THE ADJACENCY VECTOR ADJNCY WILL BE */ +/* DESTROYED. */ +/* --------------------------------------------- */ + +/* INPUT PARAMETERS - */ +/* NEQNS - NUMBER OF EQUATIONS. */ +/* (XADJ,ADJNCY) - THE ADJACENCY STRUCTURE. */ +/* DELTA - TOLERANCE VALUE FOR MULTIPLE ELIMINATION. */ +/* MAXINT - MAXIMUM MACHINE REPRESENTABLE (SHORT) INTEGER */ +/* (ANY SMALLER ESTIMATE WILL DO) FOR MARKING */ +/* NODES. */ + +/* OUTPUT PARAMETERS - */ +/* PERM - THE MINIMUM DEGREE ORDERING. */ +/* INVP - THE INVERSE OF PERM. */ +/* NOFSUB - AN UPPER BOUND ON THE NUMBER OF NONZERO */ +/* SUBSCRIPTS FOR THE COMPRESSED STORAGE SCHEME. */ + +/* WORKING PARAMETERS - */ +/* DHEAD - VECTOR FOR HEAD OF DEGREE LISTS. */ +/* INVP - USED TEMPORARILY FOR DEGREE FORWARD LINK. */ +/* PERM - USED TEMPORARILY FOR DEGREE BACKWARD LINK. */ +/* QSIZE - VECTOR FOR SIZE OF SUPERNODES. */ +/* LLIST - VECTOR FOR TEMPORARY LINKED LISTS. */ +/* MARKER - A TEMPORARY MARKER VECTOR. */ + +/* PROGRAM SUBROUTINES - */ +/* MMDELM, MMDINT, MMDNUM, MMDUPD. */ + +/* *************************************************************** */ + +/* Subroutine */ int genmmd_(int *neqns, int *xadj, shortint *adjncy, + shortint *invp, shortint *perm, int *delta, shortint *dhead, + shortint *qsize, shortint *llist, shortint *marker, int *maxint, + int *nofsub) +{ + /* System generated locals */ + int i__1; + + /* Local variables */ + static int mdeg, ehead, i, mdlmt, mdnode; + extern /* Subroutine */ int mmdelm_(int *, int *, shortint *, + shortint *, shortint *, shortint *, shortint *, shortint *, + shortint *, int *, int *), mmdupd_(int *, int *, + int *, shortint *, int *, int *, shortint *, shortint + *, shortint *, shortint *, shortint *, shortint *, int *, + int *), mmdint_(int *, int *, shortint *, shortint *, + shortint *, shortint *, shortint *, shortint *, shortint *), + mmdnum_(int *, shortint *, shortint *, shortint *); + static int nextmd, tag, num; + + +/* *************************************************************** */ + + +/* *************************************************************** */ + + /* Parameter adjustments */ + --marker; + --llist; + --qsize; + --dhead; + --perm; + --invp; + --adjncy; + --xadj; + + /* Function Body */ + if (*neqns <= 0) { + return 0; + } + +/* ------------------------------------------------ */ +/* INITIALIZATION FOR THE MINIMUM DEGREE ALGORITHM. */ +/* ------------------------------------------------ */ + *nofsub = 0; + mmdint_(neqns, &xadj[1], &adjncy[1], &dhead[1], &invp[1], &perm[1], & + qsize[1], &llist[1], &marker[1]); + +/* ---------------------------------------------- */ +/* NUM COUNTS THE NUMBER OF ORDERED NODES PLUS 1. */ +/* ---------------------------------------------- */ + num = 1; + +/* ----------------------------- */ +/* ELIMINATE ALL ISOLATED NODES. */ +/* ----------------------------- */ + nextmd = dhead[1]; +L100: + if (nextmd <= 0) { + goto L200; + } + mdnode = nextmd; + nextmd = invp[mdnode]; + marker[mdnode] = *maxint; + invp[mdnode] = -num; + ++num; + goto L100; + +L200: +/* ---------------------------------------- */ +/* SEARCH FOR NODE OF THE MINIMUM DEGREE. */ +/* MDEG IS THE CURRENT MINIMUM DEGREE; */ +/* TAG IS USED TO FACILITATE MARKING NODES. */ +/* ---------------------------------------- */ + if (num > *neqns) { + goto L1000; + } + tag = 1; + dhead[1] = 0; + mdeg = 2; +L300: + if (dhead[mdeg] > 0) { + goto L400; + } + ++mdeg; + goto L300; +L400: +/* ------------------------------------------------- */ +/* USE VALUE OF DELTA TO SET UP MDLMT, WHICH GOVERNS */ +/* WHEN A DEGREE UPDATE IS TO BE PERFORMED. */ +/* ------------------------------------------------- */ + mdlmt = mdeg + *delta; + ehead = 0; + +L500: + mdnode = dhead[mdeg]; + if (mdnode > 0) { + goto L600; + } + ++mdeg; + if (mdeg > mdlmt) { + goto L900; + } + goto L500; +L600: +/* ---------------------------------------- */ +/* REMOVE MDNODE FROM THE DEGREE STRUCTURE. */ +/* ---------------------------------------- */ + nextmd = invp[mdnode]; + dhead[mdeg] = nextmd; + if (nextmd > 0) { + perm[nextmd] = -mdeg; + } + invp[mdnode] = -num; + *nofsub = *nofsub + mdeg + qsize[mdnode] - 2; + if (num + qsize[mdnode] > *neqns) { + goto L1000; + } +/* ---------------------------------------------- */ +/* ELIMINATE MDNODE AND PERFORM QUOTIENT GRAPH */ +/* TRANSFORMATION. RESET TAG VALUE IF NECESSARY. */ +/* ---------------------------------------------- */ + ++tag; + if (tag < *maxint) { + goto L800; + } + tag = 1; + i__1 = *neqns; + for (i = 1; i <= i__1; ++i) { + if (marker[i] < *maxint) { + marker[i] = 0; + } +/* L700: */ + } +L800: + mmdelm_(&mdnode, &xadj[1], &adjncy[1], &dhead[1], &invp[1], &perm[1], & + qsize[1], &llist[1], &marker[1], maxint, &tag); + num += qsize[mdnode]; + llist[mdnode] = ehead; + ehead = mdnode; + if (*delta >= 0) { + goto L500; + } +L900: +/* ------------------------------------------- */ +/* UPDATE DEGREES OF THE NODES INVOLVED IN THE */ +/* MINIMUM DEGREE NODES ELIMINATION. */ +/* ------------------------------------------- */ + if (num > *neqns) { + goto L1000; + } + mmdupd_(&ehead, neqns, &xadj[1], &adjncy[1], delta, &mdeg, &dhead[1], & + invp[1], &perm[1], &qsize[1], &llist[1], &marker[1], maxint, &tag) + ; + goto L300; + +L1000: + mmdnum_(neqns, &perm[1], &invp[1], &qsize[1]); + return 0; + +} /* genmmd_ */ + +/* *************************************************************** */ +/* *************************************************************** */ +/* *** MMDINT ..... MULT MINIMUM DEGREE INITIALIZATION *** */ +/* *************************************************************** */ +/* *************************************************************** */ + +/* AUTHOR - JOSEPH W.H. LIU */ +/* DEPT OF COMPUTER SCIENCE, YORK UNIVERSITY. */ + +/* PURPOSE - THIS ROUTINE PERFORMS INITIALIZATION FOR THE */ +/* MULTIPLE ELIMINATION VERSION OF THE MINIMUM DEGREE */ +/* ALGORITHM. */ + +/* INPUT PARAMETERS - */ +/* NEQNS - NUMBER OF EQUATIONS. */ +/* (XADJ,ADJNCY) - ADJACENCY STRUCTURE. */ + +/* OUTPUT PARAMETERS - */ +/* (DHEAD,DFORW,DBAKW) - DEGREE DOUBLY LINKED STRUCTURE. */ +/* QSIZE - SIZE OF SUPERNODE (INITIALIZED TO ONE). */ +/* LLIST - LINKED LIST. */ +/* MARKER - MARKER VECTOR. */ + +/* *************************************************************** */ + +/* Subroutine */ int mmdint_(int *neqns, int *xadj, shortint *adjncy, + shortint *dhead, shortint *dforw, shortint *dbakw, shortint *qsize, + shortint *llist, shortint *marker) +{ + /* System generated locals */ + int i__1; + + /* Local variables */ + static int ndeg, node, fnode; + + +/* *************************************************************** */ + + +/* *************************************************************** */ + + /* Parameter adjustments */ + --marker; + --llist; + --qsize; + --dbakw; + --dforw; + --dhead; + --adjncy; + --xadj; + + /* Function Body */ + i__1 = *neqns; + for (node = 1; node <= i__1; ++node) { + dhead[node] = 0; + qsize[node] = 1; + marker[node] = 0; + llist[node] = 0; +/* L100: */ + } +/* ------------------------------------------ */ +/* INITIALIZE THE DEGREE DOUBLY LINKED LISTS. */ +/* ------------------------------------------ */ + i__1 = *neqns; + for (node = 1; node <= i__1; ++node) { + ndeg = xadj[node + 1] - xadj[node] + 1; + fnode = dhead[ndeg]; + dforw[node] = fnode; + dhead[ndeg] = node; + if (fnode > 0) { + dbakw[fnode] = node; + } + dbakw[node] = -ndeg; +/* L200: */ + } + return 0; + +} /* mmdint_ */ + +/* *************************************************************** */ +/* *************************************************************** */ +/* ** MMDELM ..... MULTIPLE MINIMUM DEGREE ELIMINATION *** */ +/* *************************************************************** */ +/* *************************************************************** */ + +/* AUTHOR - JOSEPH W.H. LIU */ +/* DEPT OF COMPUTER SCIENCE, YORK UNIVERSITY. */ + +/* PURPOSE - THIS ROUTINE ELIMINATES THE NODE MDNODE OF */ +/* MINIMUM DEGREE FROM THE ADJACENCY STRUCTURE, WHICH */ +/* IS STORED IN THE QUOTIENT GRAPH FORMAT. IT ALSO */ +/* TRANSFORMS THE QUOTIENT GRAPH REPRESENTATION OF THE */ +/* ELIMINATION GRAPH. */ + +/* INPUT PARAMETERS - */ +/* MDNODE - NODE OF MINIMUM DEGREE. */ +/* MAXINT - ESTIMATE OF MAXIMUM REPRESENTABLE (SHORT) */ +/* INT. */ +/* TAG - TAG VALUE. */ + +/* UPDATED PARAMETERS - */ +/* (XADJ,ADJNCY) - UPDATED ADJACENCY STRUCTURE. */ +/* (DHEAD,DFORW,DBAKW) - DEGREE DOUBLY LINKED STRUCTURE. */ +/* QSIZE - SIZE OF SUPERNODE. */ +/* MARKER - MARKER VECTOR. */ +/* LLIST - TEMPORARY LINKED LIST OF ELIMINATED NABORS. */ + +/* *************************************************************** */ + +/* Subroutine */ int mmdelm_(int *mdnode, int *xadj, shortint *adjncy, + shortint *dhead, shortint *dforw, shortint *dbakw, shortint *qsize, + shortint *llist, shortint *marker, int *maxint, int *tag) +{ + /* System generated locals */ + int i__1, i__2; + + /* Local variables */ + static int node, link, rloc, rlmt, i, j, nabor, rnode, elmnt, xqnbr, + istop, jstop, istrt, jstrt, nxnode, pvnode, nqnbrs, npv; + + +/* *************************************************************** */ + + +/* *************************************************************** */ + +/* ----------------------------------------------- */ +/* FIND REACHABLE SET AND PLACE IN DATA STRUCTURE. */ +/* ----------------------------------------------- */ + /* Parameter adjustments */ + --marker; + --llist; + --qsize; + --dbakw; + --dforw; + --dhead; + --adjncy; + --xadj; + + /* Function Body */ + marker[*mdnode] = *tag; + istrt = xadj[*mdnode]; + istop = xadj[*mdnode + 1] - 1; +/* ------------------------------------------------------- */ +/* ELMNT POINTS TO THE BEGINNING OF THE LIST OF ELIMINATED */ +/* NABORS OF MDNODE, AND RLOC GIVES THE STORAGE LOCATION */ +/* FOR THE NEXT REACHABLE NODE. */ +/* ------------------------------------------------------- */ + elmnt = 0; + rloc = istrt; + rlmt = istop; + i__1 = istop; + for (i = istrt; i <= i__1; ++i) { + nabor = adjncy[i]; + if (nabor == 0) { + goto L300; + } + if (marker[nabor] >= *tag) { + goto L200; + } + marker[nabor] = *tag; + if (dforw[nabor] < 0) { + goto L100; + } + adjncy[rloc] = nabor; + ++rloc; + goto L200; +L100: + llist[nabor] = elmnt; + elmnt = nabor; +L200: + ; + } +L300: +/* ----------------------------------------------------- */ +/* MERGE WITH REACHABLE NODES FROM GENERALIZED ELEMENTS. */ +/* ----------------------------------------------------- */ + if (elmnt <= 0) { + goto L1000; + } + adjncy[rlmt] = -elmnt; + link = elmnt; +L400: + jstrt = xadj[link]; + jstop = xadj[link + 1] - 1; + i__1 = jstop; + for (j = jstrt; j <= i__1; ++j) { + node = adjncy[j]; + link = -node; + if (node < 0) { + goto L400; + } else if (node == 0) { + goto L900; + } else { + goto L500; + } +L500: + if (marker[node] >= *tag || dforw[node] < 0) { + goto L800; + } + marker[node] = *tag; +/* --------------------------------- */ +/* USE STORAGE FROM ELIMINATED NODES */ +/* IF NECESSARY. */ +/* --------------------------------- */ +L600: + if (rloc < rlmt) { + goto L700; + } + link = -adjncy[rlmt]; + rloc = xadj[link]; + rlmt = xadj[link + 1] - 1; + goto L600; +L700: + adjncy[rloc] = node; + ++rloc; +L800: + ; + } +L900: + elmnt = llist[elmnt]; + goto L300; +L1000: + if (rloc <= rlmt) { + adjncy[rloc] = 0; + } +/* -------------------------------------------------------- */ +/* FOR EACH NODE IN THE REACHABLE SET, DO THE FOLLOWING ... */ +/* -------------------------------------------------------- */ + link = *mdnode; +L1100: + istrt = xadj[link]; + istop = xadj[link + 1] - 1; + i__1 = istop; + for (i = istrt; i <= i__1; ++i) { + rnode = adjncy[i]; + link = -rnode; + if (rnode < 0) { + goto L1100; + } else if (rnode == 0) { + goto L1800; + } else { + goto L1200; + } +L1200: +/* -------------------------------------------- */ +/* IF RNODE IS IN THE DEGREE LIST STRUCTURE ... */ +/* -------------------------------------------- */ + pvnode = dbakw[rnode]; + if (pvnode == 0 || pvnode == -(*maxint)) { + goto L1300; + } +/* ------------------------------------- */ +/* THEN REMOVE RNODE FROM THE STRUCTURE. */ +/* ------------------------------------- */ + nxnode = dforw[rnode]; + if (nxnode > 0) { + dbakw[nxnode] = pvnode; + } + if (pvnode > 0) { + dforw[pvnode] = nxnode; + } + npv = -pvnode; + if (pvnode < 0) { + dhead[npv] = nxnode; + } +L1300: +/* ---------------------------------------- */ +/* PURGE INACTIVE QUOTIENT NABORS OF RNODE. */ +/* ---------------------------------------- */ + jstrt = xadj[rnode]; + jstop = xadj[rnode + 1] - 1; + xqnbr = jstrt; + i__2 = jstop; + for (j = jstrt; j <= i__2; ++j) { + nabor = adjncy[j]; + if (nabor == 0) { + goto L1500; + } + if (marker[nabor] >= *tag) { + goto L1400; + } + adjncy[xqnbr] = nabor; + ++xqnbr; +L1400: + ; + } +L1500: +/* ---------------------------------------- */ +/* IF NO ACTIVE NABOR AFTER THE PURGING ... */ +/* ---------------------------------------- */ + nqnbrs = xqnbr - jstrt; + if (nqnbrs > 0) { + goto L1600; + } +/* ----------------------------- */ +/* THEN MERGE RNODE WITH MDNODE. */ +/* ----------------------------- */ + qsize[*mdnode] += qsize[rnode]; + qsize[rnode] = 0; + marker[rnode] = *maxint; + dforw[rnode] = -(*mdnode); + dbakw[rnode] = -(*maxint); + goto L1700; +L1600: +/* -------------------------------------- */ +/* ELSE FLAG RNODE FOR DEGREE UPDATE, AND */ +/* ADD MDNODE AS A NABOR OF RNODE. */ +/* -------------------------------------- */ + dforw[rnode] = nqnbrs + 1; + dbakw[rnode] = 0; + adjncy[xqnbr] = *mdnode; + ++xqnbr; + if (xqnbr <= jstop) { + adjncy[xqnbr] = 0; + } + +L1700: + ; + } +L1800: + return 0; + +} /* mmdelm_ */ + +/* *************************************************************** */ +/* *************************************************************** */ +/* ***** MMDUPD ..... MULTIPLE MINIMUM DEGREE UPDATE ***** */ +/* *************************************************************** */ +/* *************************************************************** */ + +/* AUTHOR - JOSEPH W.H. LIU */ +/* DEPT OF COMPUTER SCIENCE, YORK UNIVERSITY. */ + +/* PURPOSE - THIS ROUTINE UPDATES THE DEGREES OF NODES */ +/* AFTER A MULTIPLE ELIMINATION STEP. */ + +/* INPUT PARAMETERS - */ +/* EHEAD - THE BEGINNING OF THE LIST OF ELIMINATED */ +/* NODES (I.E., NEWLY FORMED ELEMENTS). */ +/* NEQNS - NUMBER OF EQUATIONS. */ +/* (XADJ,ADJNCY) - ADJACENCY STRUCTURE. */ +/* DELTA - TOLERANCE VALUE FOR MULTIPLE ELIMINATION. */ +/* MAXINT - MAXIMUM MACHINE REPRESENTABLE (SHORT) */ +/* INTEGER. */ + +/* UPDATED PARAMETERS - */ +/* MDEG - NEW MINIMUM DEGREE AFTER DEGREE UPDATE. */ +/* (DHEAD,DFORW,DBAKW) - DEGREE DOUBLY LINKED STRUCTURE. */ +/* QSIZE - SIZE OF SUPERNODE. */ +/* LLIST - WORKING LINKED LIST. */ +/* MARKER - MARKER VECTOR FOR DEGREE UPDATE. */ +/* TAG - TAG VALUE. */ + +/* *************************************************************** */ + +/* Subroutine */ int mmdupd_(int *ehead, int *neqns, int *xadj, + shortint *adjncy, int *delta, int *mdeg, shortint *dhead, + shortint *dforw, shortint *dbakw, shortint *qsize, shortint *llist, + shortint *marker, int *maxint, int *tag) +{ + /* System generated locals */ + int i__1, i__2; + + /* Local variables */ + static int node, mtag, link, mdeg0, i, j, enode, fnode, nabor, elmnt, + istop, jstop, q2head, istrt, jstrt, qxhead, iq2, deg, deg0; + + +/* *************************************************************** */ + + +/* *************************************************************** */ + + /* Parameter adjustments */ + --marker; + --llist; + --qsize; + --dbakw; + --dforw; + --dhead; + --adjncy; + --xadj; + + /* Function Body */ + mdeg0 = *mdeg + *delta; + elmnt = *ehead; +L100: +/* ------------------------------------------------------- */ +/* FOR EACH OF THE NEWLY FORMED ELEMENT, DO THE FOLLOWING. */ +/* (RESET TAG VALUE IF NECESSARY.) */ +/* ------------------------------------------------------- */ + if (elmnt <= 0) { + return 0; + } + mtag = *tag + mdeg0; + if (mtag < *maxint) { + goto L300; + } + *tag = 1; + i__1 = *neqns; + for (i = 1; i <= i__1; ++i) { + if (marker[i] < *maxint) { + marker[i] = 0; + } +/* L200: */ + } + mtag = *tag + mdeg0; +L300: +/* --------------------------------------------- */ +/* CREATE TWO LINKED LISTS FROM NODES ASSOCIATED */ +/* WITH ELMNT: ONE WITH TWO NABORS (Q2HEAD) IN */ +/* ADJACENCY STRUCTURE, AND THE OTHER WITH MORE */ +/* THAN TWO NABORS (QXHEAD). ALSO COMPUTE DEG0, */ +/* NUMBER OF NODES IN THIS ELEMENT. */ +/* --------------------------------------------- */ + q2head = 0; + qxhead = 0; + deg0 = 0; + link = elmnt; +L400: + istrt = xadj[link]; + istop = xadj[link + 1] - 1; + i__1 = istop; + for (i = istrt; i <= i__1; ++i) { + enode = adjncy[i]; + link = -enode; + if (enode < 0) { + goto L400; + } else if (enode == 0) { + goto L800; + } else { + goto L500; + } + +L500: + if (qsize[enode] == 0) { + goto L700; + } + deg0 += qsize[enode]; + marker[enode] = mtag; +/* ---------------------------------- */ +/* IF ENODE REQUIRES A DEGREE UPDATE, */ +/* THEN DO THE FOLLOWING. */ +/* ---------------------------------- */ + if (dbakw[enode] != 0) { + goto L700; + } +/* --------------------------------------- +*/ +/* PLACE EITHER IN QXHEAD OR Q2HEAD LISTS. +*/ +/* --------------------------------------- +*/ + if (dforw[enode] == 2) { + goto L600; + } + llist[enode] = qxhead; + qxhead = enode; + goto L700; +L600: + llist[enode] = q2head; + q2head = enode; +L700: + ; + } +L800: +/* -------------------------------------------- */ +/* FOR EACH ENODE IN Q2 LIST, DO THE FOLLOWING. */ +/* -------------------------------------------- */ + enode = q2head; + iq2 = 1; +L900: + if (enode <= 0) { + goto L1500; + } + if (dbakw[enode] != 0) { + goto L2200; + } + ++(*tag); + deg = deg0; +/* ------------------------------------------ */ +/* IDENTIFY THE OTHER ADJACENT ELEMENT NABOR. */ +/* ------------------------------------------ */ + istrt = xadj[enode]; + nabor = adjncy[istrt]; + if (nabor == elmnt) { + nabor = adjncy[istrt + 1]; + } +/* ------------------------------------------------ */ +/* IF NABOR IS UNELIMINATED, INCREASE DEGREE COUNT. */ +/* ------------------------------------------------ */ + link = nabor; + if (dforw[nabor] < 0) { + goto L1000; + } + deg += qsize[nabor]; + goto L2100; +L1000: +/* -------------------------------------------- */ +/* OTHERWISE, FOR EACH NODE IN THE 2ND ELEMENT, */ +/* DO THE FOLLOWING. */ +/* -------------------------------------------- */ + istrt = xadj[link]; + istop = xadj[link + 1] - 1; + i__1 = istop; + for (i = istrt; i <= i__1; ++i) { + node = adjncy[i]; + link = -node; + if (node == enode) { + goto L1400; + } + if (node < 0) { + goto L1000; + } else if (node == 0) { + goto L2100; + } else { + goto L1100; + } + +L1100: + if (qsize[node] == 0) { + goto L1400; + } + if (marker[node] >= *tag) { + goto L1200; + } +/* ----------------------------------- +-- */ +/* CASE WHEN NODE IS NOT YET CONSIDERED +. */ +/* ----------------------------------- +-- */ + marker[node] = *tag; + deg += qsize[node]; + goto L1400; +L1200: +/* ---------------------------------------- + */ +/* CASE WHEN NODE IS INDISTINGUISHABLE FROM + */ +/* ENODE. MERGE THEM INTO A NEW SUPERNODE. + */ +/* ---------------------------------------- + */ + if (dbakw[node] != 0) { + goto L1400; + } + if (dforw[node] != 2) { + goto L1300; + } + qsize[enode] += qsize[node]; + qsize[node] = 0; + marker[node] = *maxint; + dforw[node] = -enode; + dbakw[node] = -(*maxint); + goto L1400; +L1300: +/* -------------------------------------- +*/ +/* CASE WHEN NODE IS OUTMATCHED BY ENODE. +*/ +/* -------------------------------------- +*/ + if (dbakw[node] == 0) { + dbakw[node] = -(*maxint); + } +L1400: + ; + } + goto L2100; +L1500: +/* ------------------------------------------------ */ +/* FOR EACH ENODE IN THE QX LIST, DO THE FOLLOWING. */ +/* ------------------------------------------------ */ + enode = qxhead; + iq2 = 0; +L1600: + if (enode <= 0) { + goto L2300; + } + if (dbakw[enode] != 0) { + goto L2200; + } + ++(*tag); + deg = deg0; +/* --------------------------------- */ +/* FOR EACH UNMARKED NABOR OF ENODE, */ +/* DO THE FOLLOWING. */ +/* --------------------------------- */ + istrt = xadj[enode]; + istop = xadj[enode + 1] - 1; + i__1 = istop; + for (i = istrt; i <= i__1; ++i) { + nabor = adjncy[i]; + if (nabor == 0) { + goto L2100; + } + if (marker[nabor] >= *tag) { + goto L2000; + } + marker[nabor] = *tag; + link = nabor; +/* ------------------------------ */ +/* IF UNELIMINATED, INCLUDE IT IN */ +/* DEG COUNT. */ +/* ------------------------------ */ + if (dforw[nabor] < 0) { + goto L1700; + } + deg += qsize[nabor]; + goto L2000; +L1700: +/* ------------------------------- +*/ +/* IF ELIMINATED, INCLUDE UNMARKED +*/ +/* NODES IN THIS ELEMENT INTO THE +*/ +/* DEGREE COUNT. */ +/* ------------------------------- +*/ + jstrt = xadj[link]; + jstop = xadj[link + 1] - 1; + i__2 = jstop; + for (j = jstrt; j <= i__2; ++j) { + node = adjncy[j]; + link = -node; + if (node < 0) { + goto L1700; + } else if (node == 0) { + goto L2000; + } else { + goto L1800; + } + +L1800: + if (marker[node] >= *tag) { + goto L1900; + } + marker[node] = *tag; + deg += qsize[node]; +L1900: + ; + } +L2000: + ; + } +L2100: +/* ------------------------------------------- */ +/* UPDATE EXTERNAL DEGREE OF ENODE IN DEGREE */ +/* STRUCTURE, AND MDEG (MIN DEG) IF NECESSARY. */ +/* ------------------------------------------- */ + deg = deg - qsize[enode] + 1; + fnode = dhead[deg]; + dforw[enode] = fnode; + dbakw[enode] = -deg; + if (fnode > 0) { + dbakw[fnode] = enode; + } + dhead[deg] = enode; + if (deg < *mdeg) { + *mdeg = deg; + } +L2200: +/* ---------------------------------- */ +/* GET NEXT ENODE IN CURRENT ELEMENT. */ +/* ---------------------------------- */ + enode = llist[enode]; + if (iq2 == 1) { + goto L900; + } + goto L1600; +L2300: +/* ----------------------------- */ +/* GET NEXT ELEMENT IN THE LIST. */ +/* ----------------------------- */ + *tag = mtag; + elmnt = llist[elmnt]; + goto L100; + +} /* mmdupd_ */ + +/* *************************************************************** */ +/* *************************************************************** */ +/* ***** MMDNUM ..... MULTI MINIMUM DEGREE NUMBERING ***** */ +/* *************************************************************** */ +/* *************************************************************** */ + +/* AUTHOR - JOSEPH W.H. LIU */ +/* DEPT OF COMPUTER SCIENCE, YORK UNIVERSITY. */ + +/* PURPOSE - THIS ROUTINE PERFORMS THE FINAL STEP IN */ +/* PRODUCING THE PERMUTATION AND INVERSE PERMUTATION */ +/* VECTORS IN THE MULTIPLE ELIMINATION VERSION OF THE */ +/* MINIMUM DEGREE ORDERING ALGORITHM. */ + +/* INPUT PARAMETERS - */ +/* NEQNS - NUMBER OF EQUATIONS. */ +/* QSIZE - SIZE OF SUPERNODES AT ELIMINATION. */ + +/* UPDATED PARAMETERS - */ +/* INVP - INVERSE PERMUTATION VECTOR. ON INPUT, */ +/* IF QSIZE(NODE)=0, THEN NODE HAS BEEN MERGED */ +/* INTO THE NODE -INVP(NODE); OTHERWISE, */ +/* -INVP(NODE) IS ITS INVERSE LABELLING. */ + +/* OUTPUT PARAMETERS - */ +/* PERM - THE PERMUTATION VECTOR. */ + +/* *************************************************************** */ + +/* Subroutine */ int mmdnum_(int *neqns, shortint *perm, shortint *invp, + shortint *qsize) +{ + /* System generated locals */ + int i__1; + + /* Local variables */ + static int node, root, nextf, father, nqsize, num; + + +/* *************************************************************** */ + + +/* *************************************************************** */ + + /* Parameter adjustments */ + --qsize; + --invp; + --perm; + + /* Function Body */ + i__1 = *neqns; + for (node = 1; node <= i__1; ++node) { + nqsize = qsize[node]; + if (nqsize <= 0) { + perm[node] = invp[node]; + } + if (nqsize > 0) { + perm[node] = -invp[node]; + } +/* L100: */ + } +/* ------------------------------------------------------ */ +/* FOR EACH NODE WHICH HAS BEEN MERGED, DO THE FOLLOWING. */ +/* ------------------------------------------------------ */ + i__1 = *neqns; + for (node = 1; node <= i__1; ++node) { + if (perm[node] > 0) { + goto L500; + } +/* ----------------------------------------- */ +/* TRACE THE MERGED TREE UNTIL ONE WHICH HAS */ +/* NOT BEEN MERGED, CALL IT ROOT. */ +/* ----------------------------------------- */ + father = node; +L200: + if (perm[father] > 0) { + goto L300; + } + father = -perm[father]; + goto L200; +L300: +/* ----------------------- */ +/* NUMBER NODE AFTER ROOT. */ +/* ----------------------- */ + root = father; + num = perm[root] + 1; + invp[node] = -num; + perm[root] = num; +/* ------------------------ */ +/* SHORTEN THE MERGED TREE. */ +/* ------------------------ */ + father = node; +L400: + nextf = -perm[father]; + if (nextf <= 0) { + goto L500; + } + perm[father] = -root; + father = nextf; + goto L400; +L500: + ; + } +/* ---------------------- */ +/* READY TO COMPUTE PERM. */ +/* ---------------------- */ + i__1 = *neqns; + for (node = 1; node <= i__1; ++node) { + num = -invp[node]; + invp[node] = num; + perm[num] = node; +/* L600: */ + } + return 0; + +} /* mmdnum_ */ + diff --git a/intern/opennl/superlu/relax_snode.c b/intern/opennl/superlu/relax_snode.c new file mode 100644 index 00000000000..549f3fcf873 --- /dev/null +++ b/intern/opennl/superlu/relax_snode.c @@ -0,0 +1,71 @@ +/* + * -- SuperLU routine (version 2.0) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * November 15, 1997 + * + */ +/* + Copyright (c) 1994 by Xerox Corporation. All rights reserved. + + THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY + EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. + + Permission is hereby granted to use or copy this program for any + purpose, provided the above notices are retained on all copies. + Permission to modify the code and to distribute modified code is + granted, provided the above notices are retained, and a notice that + the code was modified is included with the above copyright notice. +*/ + +#include "ssp_defs.h" + +void +relax_snode ( + const int n, + int *et, /* column elimination tree */ + const int relax_columns, /* max no of columns allowed in a + relaxed snode */ + int *descendants, /* no of descendants of each node + in the etree */ + int *relax_end /* last column in a supernode */ + ) +{ +/* + * Purpose + * ======= + * relax_snode() - Identify the initial relaxed supernodes, assuming that + * the matrix has been reordered according to the postorder of the etree. + * + */ + register int j, parent; + register int snode_start; /* beginning of a snode */ + + ifill (relax_end, n, EMPTY); + for (j = 0; j < n; j++) descendants[j] = 0; + + /* Compute the number of descendants of each node in the etree */ + for (j = 0; j < n; j++) { + parent = et[j]; + if ( parent != n ) /* not the dummy root */ + descendants[parent] += descendants[j] + 1; + } + + /* Identify the relaxed supernodes by postorder traversal of the etree. */ + for (j = 0; j < n; ) { + parent = et[j]; + snode_start = j; + while ( parent != n && descendants[parent] < relax_columns ) { + j = parent; + parent = et[j]; + } + /* Found a supernode with j being the last column. */ + relax_end[snode_start] = j; /* Last column is recorded */ + j++; + /* Search for a new leaf */ + while ( descendants[j] != 0 && j < n ) j++; + } + + /*printf("No of relaxed snodes: %d; relaxed columns: %d\n", + nsuper, no_relaxed_col); */ +} diff --git a/intern/opennl/superlu/scolumn_bmod.c b/intern/opennl/superlu/scolumn_bmod.c new file mode 100644 index 00000000000..c877a27dd53 --- /dev/null +++ b/intern/opennl/superlu/scolumn_bmod.c @@ -0,0 +1,353 @@ + +/* + * -- SuperLU routine (version 3.0) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * October 15, 2003 + * + */ +/* + Copyright (c) 1994 by Xerox Corporation. All rights reserved. + + THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY + EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. + + Permission is hereby granted to use or copy this program for any + purpose, provided the above notices are retained on all copies. + Permission to modify the code and to distribute modified code is + granted, provided the above notices are retained, and a notice that + the code was modified is included with the above copyright notice. +*/ + +#include +#include +#include "ssp_defs.h" + +/* + * Function prototypes + */ +void susolve(int, int, float*, float*); +void slsolve(int, int, float*, float*); +void smatvec(int, int, int, float*, float*, float*); + + + +/* Return value: 0 - successful return + * > 0 - number of bytes allocated when run out of space + */ +int +scolumn_bmod ( + const int jcol, /* in */ + const int nseg, /* in */ + float *dense, /* in */ + float *tempv, /* working array */ + int *segrep, /* in */ + int *repfnz, /* in */ + int fpanelc, /* in -- first column in the current panel */ + GlobalLU_t *Glu, /* modified */ + SuperLUStat_t *stat /* output */ + ) +{ +/* + * Purpose: + * ======== + * Performs numeric block updates (sup-col) in topological order. + * It features: col-col, 2cols-col, 3cols-col, and sup-col updates. + * Special processing on the supernodal portion of L\U[*,j] + * + */ +#ifdef _CRAY + _fcd ftcs1 = _cptofcd("L", strlen("L")), + ftcs2 = _cptofcd("N", strlen("N")), + ftcs3 = _cptofcd("U", strlen("U")); +#endif + +#ifdef USE_VENDOR_BLAS + int incx = 1, incy = 1; + float alpha, beta; +#endif + + /* krep = representative of current k-th supernode + * fsupc = first supernodal column + * nsupc = no of columns in supernode + * nsupr = no of rows in supernode (used as leading dimension) + * luptr = location of supernodal LU-block in storage + * kfnz = first nonz in the k-th supernodal segment + * no_zeros = no of leading zeros in a supernodal U-segment + */ + float ukj, ukj1, ukj2; + int luptr, luptr1, luptr2; + int fsupc, nsupc, nsupr, segsze; + int nrow; /* No of rows in the matrix of matrix-vector */ + int jcolp1, jsupno, k, ksub, krep, krep_ind, ksupno; + register int lptr, kfnz, isub, irow, i; + register int no_zeros, new_next; + int ufirst, nextlu; + int fst_col; /* First column within small LU update */ + int d_fsupc; /* Distance between the first column of the current + panel and the first column of the current snode. */ + int *xsup, *supno; + int *lsub, *xlsub; + float *lusup; + int *xlusup; + int nzlumax; + float *tempv1; + float zero = 0.0; +#ifdef USE_VENDOR_BLAS + float one = 1.0; + float none = -1.0; +#endif + int mem_error; + flops_t *ops = stat->ops; + + xsup = Glu->xsup; + supno = Glu->supno; + lsub = Glu->lsub; + xlsub = Glu->xlsub; + lusup = Glu->lusup; + xlusup = Glu->xlusup; + nzlumax = Glu->nzlumax; + jcolp1 = jcol + 1; + jsupno = supno[jcol]; + + /* + * For each nonz supernode segment of U[*,j] in topological order + */ + k = nseg - 1; + for (ksub = 0; ksub < nseg; ksub++) { + + krep = segrep[k]; + k--; + ksupno = supno[krep]; + if ( jsupno != ksupno ) { /* Outside the rectangular supernode */ + + fsupc = xsup[ksupno]; + fst_col = SUPERLU_MAX ( fsupc, fpanelc ); + + /* Distance from the current supernode to the current panel; + d_fsupc=0 if fsupc > fpanelc. */ + d_fsupc = fst_col - fsupc; + + luptr = xlusup[fst_col] + d_fsupc; + lptr = xlsub[fsupc] + d_fsupc; + + kfnz = repfnz[krep]; + kfnz = SUPERLU_MAX ( kfnz, fpanelc ); + + segsze = krep - kfnz + 1; + nsupc = krep - fst_col + 1; + nsupr = xlsub[fsupc+1] - xlsub[fsupc]; /* Leading dimension */ + nrow = nsupr - d_fsupc - nsupc; + krep_ind = lptr + nsupc - 1; + + ops[TRSV] += segsze * (segsze - 1); + ops[GEMV] += 2 * nrow * segsze; + + + /* + * Case 1: Update U-segment of size 1 -- col-col update + */ + if ( segsze == 1 ) { + ukj = dense[lsub[krep_ind]]; + luptr += nsupr*(nsupc-1) + nsupc; + + for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { + irow = lsub[i]; + dense[irow] -= ukj*lusup[luptr]; + luptr++; + } + + } else if ( segsze <= 3 ) { + ukj = dense[lsub[krep_ind]]; + luptr += nsupr*(nsupc-1) + nsupc-1; + ukj1 = dense[lsub[krep_ind - 1]]; + luptr1 = luptr - nsupr; + + if ( segsze == 2 ) { /* Case 2: 2cols-col update */ + ukj -= ukj1 * lusup[luptr1]; + dense[lsub[krep_ind]] = ukj; + for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { + irow = lsub[i]; + luptr++; + luptr1++; + dense[irow] -= ( ukj*lusup[luptr] + + ukj1*lusup[luptr1] ); + } + } else { /* Case 3: 3cols-col update */ + ukj2 = dense[lsub[krep_ind - 2]]; + luptr2 = luptr1 - nsupr; + ukj1 -= ukj2 * lusup[luptr2-1]; + ukj = ukj - ukj1*lusup[luptr1] - ukj2*lusup[luptr2]; + dense[lsub[krep_ind]] = ukj; + dense[lsub[krep_ind-1]] = ukj1; + for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { + irow = lsub[i]; + luptr++; + luptr1++; + luptr2++; + dense[irow] -= ( ukj*lusup[luptr] + + ukj1*lusup[luptr1] + ukj2*lusup[luptr2] ); + } + } + + + + } else { + /* + * Case: sup-col update + * Perform a triangular solve and block update, + * then scatter the result of sup-col update to dense + */ + + no_zeros = kfnz - fst_col; + + /* Copy U[*,j] segment from dense[*] to tempv[*] */ + isub = lptr + no_zeros; + for (i = 0; i < segsze; i++) { + irow = lsub[isub]; + tempv[i] = dense[irow]; + ++isub; + } + + /* Dense triangular solve -- start effective triangle */ + luptr += nsupr * no_zeros + no_zeros; + +#ifdef USE_VENDOR_BLAS +#ifdef _CRAY + STRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], + &nsupr, tempv, &incx ); +#else + strsv_( "L", "N", "U", &segsze, &lusup[luptr], + &nsupr, tempv, &incx ); +#endif + luptr += segsze; /* Dense matrix-vector */ + tempv1 = &tempv[segsze]; + alpha = one; + beta = zero; +#ifdef _CRAY + SGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr], + &nsupr, tempv, &incx, &beta, tempv1, &incy ); +#else + sgemv_( "N", &nrow, &segsze, &alpha, &lusup[luptr], + &nsupr, tempv, &incx, &beta, tempv1, &incy ); +#endif +#else + slsolve ( nsupr, segsze, &lusup[luptr], tempv ); + + luptr += segsze; /* Dense matrix-vector */ + tempv1 = &tempv[segsze]; + smatvec (nsupr, nrow , segsze, &lusup[luptr], tempv, tempv1); +#endif + + + /* Scatter tempv[] into SPA dense[] as a temporary storage */ + isub = lptr + no_zeros; + for (i = 0; i < segsze; i++) { + irow = lsub[isub]; + dense[irow] = tempv[i]; + tempv[i] = zero; + ++isub; + } + + /* Scatter tempv1[] into SPA dense[] */ + for (i = 0; i < nrow; i++) { + irow = lsub[isub]; + dense[irow] -= tempv1[i]; + tempv1[i] = zero; + ++isub; + } + } + + } /* if jsupno ... */ + + } /* for each segment... */ + + /* + * Process the supernodal portion of L\U[*,j] + */ + nextlu = xlusup[jcol]; + fsupc = xsup[jsupno]; + + /* Copy the SPA dense into L\U[*,j] */ + new_next = nextlu + xlsub[fsupc+1] - xlsub[fsupc]; + while ( new_next > nzlumax ) { + if ((mem_error = sLUMemXpand(jcol, nextlu, LUSUP, &nzlumax, Glu))) + return (mem_error); + lusup = Glu->lusup; + lsub = Glu->lsub; + } + + for (isub = xlsub[fsupc]; isub < xlsub[fsupc+1]; isub++) { + irow = lsub[isub]; + lusup[nextlu] = dense[irow]; + dense[irow] = zero; + ++nextlu; + } + + xlusup[jcolp1] = nextlu; /* Close L\U[*,jcol] */ + + /* For more updates within the panel (also within the current supernode), + * should start from the first column of the panel, or the first column + * of the supernode, whichever is bigger. There are 2 cases: + * 1) fsupc < fpanelc, then fst_col := fpanelc + * 2) fsupc >= fpanelc, then fst_col := fsupc + */ + fst_col = SUPERLU_MAX ( fsupc, fpanelc ); + + if ( fst_col < jcol ) { + + /* Distance between the current supernode and the current panel. + d_fsupc=0 if fsupc >= fpanelc. */ + d_fsupc = fst_col - fsupc; + + lptr = xlsub[fsupc] + d_fsupc; + luptr = xlusup[fst_col] + d_fsupc; + nsupr = xlsub[fsupc+1] - xlsub[fsupc]; /* Leading dimension */ + nsupc = jcol - fst_col; /* Excluding jcol */ + nrow = nsupr - d_fsupc - nsupc; + + /* Points to the beginning of jcol in snode L\U(jsupno) */ + ufirst = xlusup[jcol] + d_fsupc; + + ops[TRSV] += nsupc * (nsupc - 1); + ops[GEMV] += 2 * nrow * nsupc; + +#ifdef USE_VENDOR_BLAS +#ifdef _CRAY + STRSV( ftcs1, ftcs2, ftcs3, &nsupc, &lusup[luptr], + &nsupr, &lusup[ufirst], &incx ); +#else + strsv_( "L", "N", "U", &nsupc, &lusup[luptr], + &nsupr, &lusup[ufirst], &incx ); +#endif + + alpha = none; beta = one; /* y := beta*y + alpha*A*x */ + +#ifdef _CRAY + SGEMV( ftcs2, &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr, + &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy ); +#else + sgemv_( "N", &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr, + &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy ); +#endif +#else + slsolve ( nsupr, nsupc, &lusup[luptr], &lusup[ufirst] ); + + smatvec ( nsupr, nrow, nsupc, &lusup[luptr+nsupc], + &lusup[ufirst], tempv ); + + /* Copy updates from tempv[*] into lusup[*] */ + isub = ufirst + nsupc; + for (i = 0; i < nrow; i++) { + lusup[isub] -= tempv[i]; + tempv[i] = 0.0; + ++isub; + } + +#endif + + + } /* if fst_col < jcol ... */ + + return 0; +} diff --git a/intern/opennl/superlu/scolumn_dfs.c b/intern/opennl/superlu/scolumn_dfs.c new file mode 100644 index 00000000000..ecfb5c3b839 --- /dev/null +++ b/intern/opennl/superlu/scolumn_dfs.c @@ -0,0 +1,270 @@ + + +/* + * -- SuperLU routine (version 3.0) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * October 15, 2003 + * + */ +/* + Copyright (c) 1994 by Xerox Corporation. All rights reserved. + + THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY + EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. + + Permission is hereby granted to use or copy this program for any + purpose, provided the above notices are retained on all copies. + Permission to modify the code and to distribute modified code is + granted, provided the above notices are retained, and a notice that + the code was modified is included with the above copyright notice. +*/ + +#include "ssp_defs.h" + +/* What type of supernodes we want */ +#define T2_SUPER + +int +scolumn_dfs( + const int m, /* in - number of rows in the matrix */ + const int jcol, /* in */ + int *perm_r, /* in */ + int *nseg, /* modified - with new segments appended */ + int *lsub_col, /* in - defines the RHS vector to start the dfs */ + int *segrep, /* modified - with new segments appended */ + int *repfnz, /* modified */ + int *xprune, /* modified */ + int *marker, /* modified */ + int *parent, /* working array */ + int *xplore, /* working array */ + GlobalLU_t *Glu /* modified */ + ) +{ +/* + * Purpose + * ======= + * "column_dfs" performs a symbolic factorization on column jcol, and + * decide the supernode boundary. + * + * This routine does not use numeric values, but only use the RHS + * row indices to start the dfs. + * + * A supernode representative is the last column of a supernode. + * The nonzeros in U[*,j] are segments that end at supernodal + * representatives. The routine returns a list of such supernodal + * representatives in topological order of the dfs that generates them. + * The location of the first nonzero in each such supernodal segment + * (supernodal entry location) is also returned. + * + * Local parameters + * ================ + * nseg: no of segments in current U[*,j] + * jsuper: jsuper=EMPTY if column j does not belong to the same + * supernode as j-1. Otherwise, jsuper=nsuper. + * + * marker2: A-row --> A-row/col (0/1) + * repfnz: SuperA-col --> PA-row + * parent: SuperA-col --> SuperA-col + * xplore: SuperA-col --> index to L-structure + * + * Return value + * ============ + * 0 success; + * > 0 number of bytes allocated when run out of space. + * + */ + int jcolp1, jcolm1, jsuper, nsuper, nextl; + int k, krep, krow, kmark, kperm; + int *marker2; /* Used for small panel LU */ + int fsupc; /* First column of a snode */ + int myfnz; /* First nonz column of a U-segment */ + int chperm, chmark, chrep, kchild; + int xdfs, maxdfs, kpar, oldrep; + int jptr, jm1ptr; + int ito, ifrom, istop; /* Used to compress row subscripts */ + int mem_error; + int *xsup, *supno, *lsub, *xlsub; + int nzlmax; + static int first = 1, maxsuper; + + xsup = Glu->xsup; + supno = Glu->supno; + lsub = Glu->lsub; + xlsub = Glu->xlsub; + nzlmax = Glu->nzlmax; + + if ( first ) { + maxsuper = sp_ienv(3); + first = 0; + } + jcolp1 = jcol + 1; + jcolm1 = jcol - 1; + nsuper = supno[jcol]; + jsuper = nsuper; + nextl = xlsub[jcol]; + marker2 = &marker[2*m]; + + + /* For each nonzero in A[*,jcol] do dfs */ + for (k = 0; lsub_col[k] != EMPTY; k++) { + + krow = lsub_col[k]; + lsub_col[k] = EMPTY; + kmark = marker2[krow]; + + /* krow was visited before, go to the next nonz */ + if ( kmark == jcol ) continue; + + /* For each unmarked nbr krow of jcol + * krow is in L: place it in structure of L[*,jcol] + */ + marker2[krow] = jcol; + kperm = perm_r[krow]; + + if ( kperm == EMPTY ) { + lsub[nextl++] = krow; /* krow is indexed into A */ + if ( nextl >= nzlmax ) { + if ((mem_error = sLUMemXpand(jcol, nextl, LSUB, &nzlmax, Glu))) + return (mem_error); + lsub = Glu->lsub; + } + if ( kmark != jcolm1 ) jsuper = EMPTY;/* Row index subset testing */ + } else { + /* krow is in U: if its supernode-rep krep + * has been explored, update repfnz[*] + */ + krep = xsup[supno[kperm]+1] - 1; + myfnz = repfnz[krep]; + + if ( myfnz != EMPTY ) { /* Visited before */ + if ( myfnz > kperm ) repfnz[krep] = kperm; + /* continue; */ + } + else { + /* Otherwise, perform dfs starting at krep */ + oldrep = EMPTY; + parent[krep] = oldrep; + repfnz[krep] = kperm; + xdfs = xlsub[krep]; + maxdfs = xprune[krep]; + + do { + /* + * For each unmarked kchild of krep + */ + while ( xdfs < maxdfs ) { + + kchild = lsub[xdfs]; + xdfs++; + chmark = marker2[kchild]; + + if ( chmark != jcol ) { /* Not reached yet */ + marker2[kchild] = jcol; + chperm = perm_r[kchild]; + + /* Case kchild is in L: place it in L[*,k] */ + if ( chperm == EMPTY ) { + lsub[nextl++] = kchild; + if ( nextl >= nzlmax ) { + if ((mem_error = + sLUMemXpand(jcol,nextl,LSUB,&nzlmax,Glu))) + return (mem_error); + lsub = Glu->lsub; + } + if ( chmark != jcolm1 ) jsuper = EMPTY; + } else { + /* Case kchild is in U: + * chrep = its supernode-rep. If its rep has + * been explored, update its repfnz[*] + */ + chrep = xsup[supno[chperm]+1] - 1; + myfnz = repfnz[chrep]; + if ( myfnz != EMPTY ) { /* Visited before */ + if ( myfnz > chperm ) + repfnz[chrep] = chperm; + } else { + /* Continue dfs at super-rep of kchild */ + xplore[krep] = xdfs; + oldrep = krep; + krep = chrep; /* Go deeper down G(L^t) */ + parent[krep] = oldrep; + repfnz[krep] = chperm; + xdfs = xlsub[krep]; + maxdfs = xprune[krep]; + } /* else */ + + } /* else */ + + } /* if */ + + } /* while */ + + /* krow has no more unexplored nbrs; + * place supernode-rep krep in postorder DFS. + * backtrack dfs to its parent + */ + segrep[*nseg] = krep; + ++(*nseg); + kpar = parent[krep]; /* Pop from stack, mimic recursion */ + if ( kpar == EMPTY ) break; /* dfs done */ + krep = kpar; + xdfs = xplore[krep]; + maxdfs = xprune[krep]; + + } while ( kpar != EMPTY ); /* Until empty stack */ + + } /* else */ + + } /* else */ + + } /* for each nonzero ... */ + + /* Check to see if j belongs in the same supernode as j-1 */ + if ( jcol == 0 ) { /* Do nothing for column 0 */ + nsuper = supno[0] = 0; + } else { + fsupc = xsup[nsuper]; + jptr = xlsub[jcol]; /* Not compressed yet */ + jm1ptr = xlsub[jcolm1]; + +#ifdef T2_SUPER + if ( (nextl-jptr != jptr-jm1ptr-1) ) jsuper = EMPTY; +#endif + /* Make sure the number of columns in a supernode doesn't + exceed threshold. */ + if ( jcol - fsupc >= maxsuper ) jsuper = EMPTY; + + /* If jcol starts a new supernode, reclaim storage space in + * lsub from the previous supernode. Note we only store + * the subscript set of the first and last columns of + * a supernode. (first for num values, last for pruning) + */ + if ( jsuper == EMPTY ) { /* starts a new supernode */ + if ( (fsupc < jcolm1-1) ) { /* >= 3 columns in nsuper */ +#ifdef CHK_COMPRESS + printf(" Compress lsub[] at super %d-%d\n", fsupc, jcolm1); +#endif + ito = xlsub[fsupc+1]; + xlsub[jcolm1] = ito; + istop = ito + jptr - jm1ptr; + xprune[jcolm1] = istop; /* Initialize xprune[jcol-1] */ + xlsub[jcol] = istop; + for (ifrom = jm1ptr; ifrom < nextl; ++ifrom, ++ito) + lsub[ito] = lsub[ifrom]; + nextl = ito; /* = istop + length(jcol) */ + } + nsuper++; + supno[jcol] = nsuper; + } /* if a new supernode */ + + } /* else: jcol > 0 */ + + /* Tidy up the pointers before exit */ + xsup[nsuper+1] = jcolp1; + supno[jcolp1] = nsuper; + xprune[jcol] = nextl; /* Initialize upper bound for pruning */ + xlsub[jcolp1] = nextl; + + return 0; +} diff --git a/intern/opennl/superlu/scopy_to_ucol.c b/intern/opennl/superlu/scopy_to_ucol.c new file mode 100644 index 00000000000..fd97352923f --- /dev/null +++ b/intern/opennl/superlu/scopy_to_ucol.c @@ -0,0 +1,105 @@ + + +/* + * -- SuperLU routine (version 2.0) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * November 15, 1997 + * + */ +/* + Copyright (c) 1994 by Xerox Corporation. All rights reserved. + + THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY + EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. + + Permission is hereby granted to use or copy this program for any + purpose, provided the above notices are retained on all copies. + Permission to modify the code and to distribute modified code is + granted, provided the above notices are retained, and a notice that + the code was modified is included with the above copyright notice. +*/ + +#include "ssp_defs.h" +#include "util.h" + +int +scopy_to_ucol( + int jcol, /* in */ + int nseg, /* in */ + int *segrep, /* in */ + int *repfnz, /* in */ + int *perm_r, /* in */ + float *dense, /* modified - reset to zero on return */ + GlobalLU_t *Glu /* modified */ + ) +{ +/* + * Gather from SPA dense[*] to global ucol[*]. + */ + int ksub, krep, ksupno; + int i, k, kfnz, segsze; + int fsupc, isub, irow; + int jsupno, nextu; + int new_next, mem_error; + int *xsup, *supno; + int *lsub, *xlsub; + float *ucol; + int *usub, *xusub; + int nzumax; + + float zero = 0.0; + + xsup = Glu->xsup; + supno = Glu->supno; + lsub = Glu->lsub; + xlsub = Glu->xlsub; + ucol = Glu->ucol; + usub = Glu->usub; + xusub = Glu->xusub; + nzumax = Glu->nzumax; + + jsupno = supno[jcol]; + nextu = xusub[jcol]; + k = nseg - 1; + for (ksub = 0; ksub < nseg; ksub++) { + krep = segrep[k--]; + ksupno = supno[krep]; + + if ( ksupno != jsupno ) { /* Should go into ucol[] */ + kfnz = repfnz[krep]; + if ( kfnz != EMPTY ) { /* Nonzero U-segment */ + + fsupc = xsup[ksupno]; + isub = xlsub[fsupc] + kfnz - fsupc; + segsze = krep - kfnz + 1; + + new_next = nextu + segsze; + while ( new_next > nzumax ) { + if ((mem_error = sLUMemXpand(jcol, nextu, UCOL, &nzumax, Glu))) + return (mem_error); + ucol = Glu->ucol; + if ((mem_error = sLUMemXpand(jcol, nextu, USUB, &nzumax, Glu))) + return (mem_error); + usub = Glu->usub; + lsub = Glu->lsub; + } + + for (i = 0; i < segsze; i++) { + irow = lsub[isub]; + usub[nextu] = perm_r[irow]; + ucol[nextu] = dense[irow]; + dense[irow] = zero; + nextu++; + isub++; + } + + } + + } + + } /* for each segment... */ + + xusub[jcol + 1] = nextu; /* Close U[*,jcol] */ + return 0; +} diff --git a/intern/opennl/superlu/sgssv.c b/intern/opennl/superlu/sgssv.c new file mode 100644 index 00000000000..ede3dc83907 --- /dev/null +++ b/intern/opennl/superlu/sgssv.c @@ -0,0 +1,221 @@ + + +/* + * -- SuperLU routine (version 3.0) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * October 15, 2003 + * + */ +#include "ssp_defs.h" + +void +sgssv(superlu_options_t *options, SuperMatrix *A, int *perm_c, int *perm_r, + SuperMatrix *L, SuperMatrix *U, SuperMatrix *B, + SuperLUStat_t *stat, int *info ) +{ +/* + * Purpose + * ======= + * + * SGSSV solves the system of linear equations A*X=B, using the + * LU factorization from SGSTRF. It performs the following steps: + * + * 1. If A is stored column-wise (A->Stype = SLU_NC): + * + * 1.1. Permute the columns of A, forming A*Pc, where Pc + * is a permutation matrix. For more details of this step, + * see sp_preorder.c. + * + * 1.2. Factor A as Pr*A*Pc=L*U with the permutation Pr determined + * by Gaussian elimination with partial pivoting. + * L is unit lower triangular with offdiagonal entries + * bounded by 1 in magnitude, and U is upper triangular. + * + * 1.3. Solve the system of equations A*X=B using the factored + * form of A. + * + * 2. If A is stored row-wise (A->Stype = SLU_NR), apply the + * above algorithm to the transpose of A: + * + * 2.1. Permute columns of transpose(A) (rows of A), + * forming transpose(A)*Pc, where Pc is a permutation matrix. + * For more details of this step, see sp_preorder.c. + * + * 2.2. Factor A as Pr*transpose(A)*Pc=L*U with the permutation Pr + * determined by Gaussian elimination with partial pivoting. + * L is unit lower triangular with offdiagonal entries + * bounded by 1 in magnitude, and U is upper triangular. + * + * 2.3. Solve the system of equations A*X=B using the factored + * form of A. + * + * See supermatrix.h for the definition of 'SuperMatrix' structure. + * + * Arguments + * ========= + * + * options (input) superlu_options_t* + * The structure defines the input parameters to control + * how the LU decomposition will be performed and how the + * system will be solved. + * + * A (input) SuperMatrix* + * Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number + * of linear equations is A->nrow. Currently, the type of A can be: + * Stype = SLU_NC or SLU_NR; Dtype = SLU_S; Mtype = SLU_GE. + * In the future, more general A may be handled. + * + * perm_c (input/output) int* + * If A->Stype = SLU_NC, column permutation vector of size A->ncol + * which defines the permutation matrix Pc; perm_c[i] = j means + * column i of A is in position j in A*Pc. + * If A->Stype = SLU_NR, column permutation vector of size A->nrow + * which describes permutation of columns of transpose(A) + * (rows of A) as described above. + * + * If options->ColPerm = MY_PERMC or options->Fact = SamePattern or + * options->Fact = SamePattern_SameRowPerm, it is an input argument. + * On exit, perm_c may be overwritten by the product of the input + * perm_c and a permutation that postorders the elimination tree + * of Pc'*A'*A*Pc; perm_c is not changed if the elimination tree + * is already in postorder. + * Otherwise, it is an output argument. + * + * perm_r (input/output) int* + * If A->Stype = SLU_NC, row permutation vector of size A->nrow, + * which defines the permutation matrix Pr, and is determined + * by partial pivoting. perm_r[i] = j means row i of A is in + * position j in Pr*A. + * If A->Stype = SLU_NR, permutation vector of size A->ncol, which + * determines permutation of rows of transpose(A) + * (columns of A) as described above. + * + * If options->RowPerm = MY_PERMR or + * options->Fact = SamePattern_SameRowPerm, perm_r is an + * input argument. + * otherwise it is an output argument. + * + * L (output) SuperMatrix* + * The factor L from the factorization + * Pr*A*Pc=L*U (if A->Stype = SLU_NC) or + * Pr*transpose(A)*Pc=L*U (if A->Stype = SLU_NR). + * Uses compressed row subscripts storage for supernodes, i.e., + * L has types: Stype = SLU_SC, Dtype = SLU_S, Mtype = SLU_TRLU. + * + * U (output) SuperMatrix* + * The factor U from the factorization + * Pr*A*Pc=L*U (if A->Stype = SLU_NC) or + * Pr*transpose(A)*Pc=L*U (if A->Stype = SLU_NR). + * Uses column-wise storage scheme, i.e., U has types: + * Stype = SLU_NC, Dtype = SLU_S, Mtype = SLU_TRU. + * + * B (input/output) SuperMatrix* + * B has types: Stype = SLU_DN, Dtype = SLU_S, Mtype = SLU_GE. + * On entry, the right hand side matrix. + * On exit, the solution matrix if info = 0; + * + * stat (output) SuperLUStat_t* + * Record the statistics on runtime and floating-point operation count. + * See util.h for the definition of 'SuperLUStat_t'. + * + * info (output) int* + * = 0: successful exit + * > 0: if info = i, and i is + * <= A->ncol: U(i,i) is exactly zero. The factorization has + * been completed, but the factor U is exactly singular, + * so the solution could not be computed. + * > A->ncol: number of bytes allocated when memory allocation + * failure occurred, plus A->ncol. + * + */ + DNformat *Bstore; + SuperMatrix *AA = NULL;/* A in SLU_NC format used by the factorization routine.*/ + SuperMatrix AC; /* Matrix postmultiplied by Pc */ + int lwork = 0, *etree, i; + + /* Set default values for some parameters */ + int panel_size; /* panel size */ + int relax; /* no of columns in a relaxed snodes */ + int permc_spec; + trans_t trans = NOTRANS; + double *utime; + double t; /* Temporary time */ + + /* Test the input parameters ... */ + *info = 0; + Bstore = B->Store; + if ( options->Fact != DOFACT ) *info = -1; + else if ( A->nrow != A->ncol || A->nrow < 0 || + (A->Stype != SLU_NC && A->Stype != SLU_NR) || + A->Dtype != SLU_S || A->Mtype != SLU_GE ) + *info = -2; + else if ( B->ncol < 0 || Bstore->lda < SUPERLU_MAX(0, A->nrow) || + B->Stype != SLU_DN || B->Dtype != SLU_S || B->Mtype != SLU_GE ) + *info = -7; + if ( *info != 0 ) { + i = -(*info); + xerbla_("sgssv", &i); + return; + } + + utime = stat->utime; + + /* Convert A to SLU_NC format when necessary. */ + if ( A->Stype == SLU_NR ) { + NRformat *Astore = A->Store; + AA = (SuperMatrix *) SUPERLU_MALLOC( sizeof(SuperMatrix) ); + sCreate_CompCol_Matrix(AA, A->ncol, A->nrow, Astore->nnz, + Astore->nzval, Astore->colind, Astore->rowptr, + SLU_NC, A->Dtype, A->Mtype); + trans = TRANS; + } else { + if ( A->Stype == SLU_NC ) AA = A; + } + + t = SuperLU_timer_(); + /* + * Get column permutation vector perm_c[], according to permc_spec: + * permc_spec = NATURAL: natural ordering + * permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A + * permc_spec = MMD_ATA: minimum degree on structure of A'*A + * permc_spec = COLAMD: approximate minimum degree column ordering + * permc_spec = MY_PERMC: the ordering already supplied in perm_c[] + */ + permc_spec = options->ColPerm; + if ( permc_spec != MY_PERMC && options->Fact == DOFACT ) + get_perm_c(permc_spec, AA, perm_c); + utime[COLPERM] = SuperLU_timer_() - t; + + etree = intMalloc(A->ncol); + + t = SuperLU_timer_(); + sp_preorder(options, AA, perm_c, etree, &AC); + utime[ETREE] = SuperLU_timer_() - t; + + panel_size = sp_ienv(1); + relax = sp_ienv(2); + + /*printf("Factor PA = LU ... relax %d\tw %d\tmaxsuper %d\trowblk %d\n", + relax, panel_size, sp_ienv(3), sp_ienv(4));*/ + t = SuperLU_timer_(); + /* Compute the LU factorization of A. */ + sgstrf(options, &AC, relax, panel_size, + etree, NULL, lwork, perm_c, perm_r, L, U, stat, info); + utime[FACT] = SuperLU_timer_() - t; + + t = SuperLU_timer_(); + if ( *info == 0 ) { + /* Solve the system A*X=B, overwriting B with X. */ + sgstrs (trans, L, U, perm_c, perm_r, B, stat, info); + } + utime[SOLVE] = SuperLU_timer_() - t; + + SUPERLU_FREE (etree); + Destroy_CompCol_Permuted(&AC); + if ( A->Stype == SLU_NR ) { + Destroy_SuperMatrix_Store(AA); + SUPERLU_FREE(AA); + } + +} diff --git a/intern/opennl/superlu/sgstrf.c b/intern/opennl/superlu/sgstrf.c new file mode 100644 index 00000000000..42f8dc9d0ee --- /dev/null +++ b/intern/opennl/superlu/sgstrf.c @@ -0,0 +1,433 @@ + +/* + * -- SuperLU routine (version 3.0) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * October 15, 2003 + * + */ +/* + Copyright (c) 1994 by Xerox Corporation. All rights reserved. + + THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY + EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. + + Permission is hereby granted to use or copy this program for any + purpose, provided the above notices are retained on all copies. + Permission to modify the code and to distribute modified code is + granted, provided the above notices are retained, and a notice that + the code was modified is included with the above copyright notice. +*/ + +#include "ssp_defs.h" + +void +sgstrf (superlu_options_t *options, SuperMatrix *A, + int relax, int panel_size, int *etree, void *work, int lwork, + int *perm_c, int *perm_r, SuperMatrix *L, SuperMatrix *U, + SuperLUStat_t *stat, int *info) +{ +/* + * Purpose + * ======= + * + * SGSTRF computes an LU factorization of a general sparse m-by-n + * matrix A using partial pivoting with row interchanges. + * The factorization has the form + * Pr * A = L * U + * where Pr is a row permutation matrix, L is lower triangular with unit + * diagonal elements (lower trapezoidal if A->nrow > A->ncol), and U is upper + * triangular (upper trapezoidal if A->nrow < A->ncol). + * + * See supermatrix.h for the definition of 'SuperMatrix' structure. + * + * Arguments + * ========= + * + * options (input) superlu_options_t* + * The structure defines the input parameters to control + * how the LU decomposition will be performed. + * + * A (input) SuperMatrix* + * Original matrix A, permuted by columns, of dimension + * (A->nrow, A->ncol). The type of A can be: + * Stype = SLU_NCP; Dtype = SLU_S; Mtype = SLU_GE. + * + * drop_tol (input) float (NOT IMPLEMENTED) + * Drop tolerance parameter. At step j of the Gaussian elimination, + * if abs(A_ij)/(max_i abs(A_ij)) < drop_tol, drop entry A_ij. + * 0 <= drop_tol <= 1. The default value of drop_tol is 0. + * + * relax (input) int + * To control degree of relaxing supernodes. If the number + * of nodes (columns) in a subtree of the elimination tree is less + * than relax, this subtree is considered as one supernode, + * regardless of the row structures of those columns. + * + * panel_size (input) int + * A panel consists of at most panel_size consecutive columns. + * + * etree (input) int*, dimension (A->ncol) + * Elimination tree of A'*A. + * Note: etree is a vector of parent pointers for a forest whose + * vertices are the integers 0 to A->ncol-1; etree[root]==A->ncol. + * On input, the columns of A should be permuted so that the + * etree is in a certain postorder. + * + * work (input/output) void*, size (lwork) (in bytes) + * User-supplied work space and space for the output data structures. + * Not referenced if lwork = 0; + * + * lwork (input) int + * Specifies the size of work array in bytes. + * = 0: allocate space internally by system malloc; + * > 0: use user-supplied work array of length lwork in bytes, + * returns error if space runs out. + * = -1: the routine guesses the amount of space needed without + * performing the factorization, and returns it in + * *info; no other side effects. + * + * perm_c (input) int*, dimension (A->ncol) + * Column permutation vector, which defines the + * permutation matrix Pc; perm_c[i] = j means column i of A is + * in position j in A*Pc. + * When searching for diagonal, perm_c[*] is applied to the + * row subscripts of A, so that diagonal threshold pivoting + * can find the diagonal of A, rather than that of A*Pc. + * + * perm_r (input/output) int*, dimension (A->nrow) + * Row permutation vector which defines the permutation matrix Pr, + * perm_r[i] = j means row i of A is in position j in Pr*A. + * If options->Fact = SamePattern_SameRowPerm, the pivoting routine + * will try to use the input perm_r, unless a certain threshold + * criterion is violated. In that case, perm_r is overwritten by + * a new permutation determined by partial pivoting or diagonal + * threshold pivoting. + * Otherwise, perm_r is output argument; + * + * L (output) SuperMatrix* + * The factor L from the factorization Pr*A=L*U; use compressed row + * subscripts storage for supernodes, i.e., L has type: + * Stype = SLU_SC, Dtype = SLU_S, Mtype = SLU_TRLU. + * + * U (output) SuperMatrix* + * The factor U from the factorization Pr*A*Pc=L*U. Use column-wise + * storage scheme, i.e., U has types: Stype = SLU_NC, + * Dtype = SLU_S, Mtype = SLU_TRU. + * + * stat (output) SuperLUStat_t* + * Record the statistics on runtime and floating-point operation count. + * See util.h for the definition of 'SuperLUStat_t'. + * + * info (output) int* + * = 0: successful exit + * < 0: if info = -i, the i-th argument had an illegal value + * > 0: if info = i, and i is + * <= A->ncol: U(i,i) is exactly zero. The factorization has + * been completed, but the factor U is exactly singular, + * and division by zero will occur if it is used to solve a + * system of equations. + * > A->ncol: number of bytes allocated when memory allocation + * failure occurred, plus A->ncol. If lwork = -1, it is + * the estimated amount of space needed, plus A->ncol. + * + * ====================================================================== + * + * Local Working Arrays: + * ====================== + * m = number of rows in the matrix + * n = number of columns in the matrix + * + * xprune[0:n-1]: xprune[*] points to locations in subscript + * vector lsub[*]. For column i, xprune[i] denotes the point where + * structural pruning begins. I.e. only xlsub[i],..,xprune[i]-1 need + * to be traversed for symbolic factorization. + * + * marker[0:3*m-1]: marker[i] = j means that node i has been + * reached when working on column j. + * Storage: relative to original row subscripts + * NOTE: There are 3 of them: marker/marker1 are used for panel dfs, + * see spanel_dfs.c; marker2 is used for inner-factorization, + * see scolumn_dfs.c. + * + * parent[0:m-1]: parent vector used during dfs + * Storage: relative to new row subscripts + * + * xplore[0:m-1]: xplore[i] gives the location of the next (dfs) + * unexplored neighbor of i in lsub[*] + * + * segrep[0:nseg-1]: contains the list of supernodal representatives + * in topological order of the dfs. A supernode representative is the + * last column of a supernode. + * The maximum size of segrep[] is n. + * + * repfnz[0:W*m-1]: for a nonzero segment U[*,j] that ends at a + * supernodal representative r, repfnz[r] is the location of the first + * nonzero in this segment. It is also used during the dfs: repfnz[r]>0 + * indicates the supernode r has been explored. + * NOTE: There are W of them, each used for one column of a panel. + * + * panel_lsub[0:W*m-1]: temporary for the nonzeros row indices below + * the panel diagonal. These are filled in during spanel_dfs(), and are + * used later in the inner LU factorization within the panel. + * panel_lsub[]/dense[] pair forms the SPA data structure. + * NOTE: There are W of them. + * + * dense[0:W*m-1]: sparse accumulating (SPA) vector for intermediate values; + * NOTE: there are W of them. + * + * tempv[0:*]: real temporary used for dense numeric kernels; + * The size of this array is defined by NUM_TEMPV() in ssp_defs.h. + * + */ + /* Local working arrays */ + NCPformat *Astore; + int *iperm_r = NULL; /* inverse of perm_r; + used when options->Fact == SamePattern_SameRowPerm */ + int *iperm_c; /* inverse of perm_c */ + int *iwork; + float *swork; + int *segrep, *repfnz, *parent, *xplore; + int *panel_lsub; /* dense[]/panel_lsub[] pair forms a w-wide SPA */ + int *xprune; + int *marker; + float *dense, *tempv; + int *relax_end; + float *a; + int *asub; + int *xa_begin, *xa_end; + int *xsup, *supno; + int *xlsub, *xlusup, *xusub; + int nzlumax; + static GlobalLU_t Glu; /* persistent to facilitate multiple factors. */ + + /* Local scalars */ + fact_t fact = options->Fact; + double diag_pivot_thresh = options->DiagPivotThresh; + int pivrow; /* pivotal row number in the original matrix A */ + int nseg1; /* no of segments in U-column above panel row jcol */ + int nseg; /* no of segments in each U-column */ + register int jcol; + register int kcol; /* end column of a relaxed snode */ + register int icol; + register int i, k, jj, new_next, iinfo; + int m, n, min_mn, jsupno, fsupc, nextlu, nextu; + int w_def; /* upper bound on panel width */ + int usepr, iperm_r_allocated = 0; + int nnzL, nnzU; + int *panel_histo = stat->panel_histo; + flops_t *ops = stat->ops; + + iinfo = 0; + m = A->nrow; + n = A->ncol; + min_mn = SUPERLU_MIN(m, n); + Astore = A->Store; + a = Astore->nzval; + asub = Astore->rowind; + xa_begin = Astore->colbeg; + xa_end = Astore->colend; + + /* Allocate storage common to the factor routines */ + *info = sLUMemInit(fact, work, lwork, m, n, Astore->nnz, + panel_size, L, U, &Glu, &iwork, &swork); + if ( *info ) return; + + xsup = Glu.xsup; + supno = Glu.supno; + xlsub = Glu.xlsub; + xlusup = Glu.xlusup; + xusub = Glu.xusub; + + SetIWork(m, n, panel_size, iwork, &segrep, &parent, &xplore, + &repfnz, &panel_lsub, &xprune, &marker); + sSetRWork(m, panel_size, swork, &dense, &tempv); + + usepr = (fact == SamePattern_SameRowPerm); + if ( usepr ) { + /* Compute the inverse of perm_r */ + iperm_r = (int *) intMalloc(m); + for (k = 0; k < m; ++k) iperm_r[perm_r[k]] = k; + iperm_r_allocated = 1; + } + iperm_c = (int *) intMalloc(n); + for (k = 0; k < n; ++k) iperm_c[perm_c[k]] = k; + + /* Identify relaxed snodes */ + relax_end = (int *) intMalloc(n); + if ( options->SymmetricMode == YES ) { + heap_relax_snode(n, etree, relax, marker, relax_end); + } else { + relax_snode(n, etree, relax, marker, relax_end); + } + + ifill (perm_r, m, EMPTY); + ifill (marker, m * NO_MARKER, EMPTY); + supno[0] = -1; + xsup[0] = xlsub[0] = xusub[0] = xlusup[0] = 0; + w_def = panel_size; + + /* + * Work on one "panel" at a time. A panel is one of the following: + * (a) a relaxed supernode at the bottom of the etree, or + * (b) panel_size contiguous columns, defined by the user + */ + for (jcol = 0; jcol < min_mn; ) { + + if ( relax_end[jcol] != EMPTY ) { /* start of a relaxed snode */ + kcol = relax_end[jcol]; /* end of the relaxed snode */ + panel_histo[kcol-jcol+1]++; + + /* -------------------------------------- + * Factorize the relaxed supernode(jcol:kcol) + * -------------------------------------- */ + /* Determine the union of the row structure of the snode */ + if ( (*info = ssnode_dfs(jcol, kcol, asub, xa_begin, xa_end, + xprune, marker, &Glu)) != 0 ) + return; + + nextu = xusub[jcol]; + nextlu = xlusup[jcol]; + jsupno = supno[jcol]; + fsupc = xsup[jsupno]; + new_next = nextlu + (xlsub[fsupc+1]-xlsub[fsupc])*(kcol-jcol+1); + nzlumax = Glu.nzlumax; + while ( new_next > nzlumax ) { + if ( (*info = sLUMemXpand(jcol, nextlu, LUSUP, &nzlumax, &Glu)) ) + return; + } + + for (icol = jcol; icol<= kcol; icol++) { + xusub[icol+1] = nextu; + + /* Scatter into SPA dense[*] */ + for (k = xa_begin[icol]; k < xa_end[icol]; k++) + dense[asub[k]] = a[k]; + + /* Numeric update within the snode */ + ssnode_bmod(icol, fsupc, dense, tempv, &Glu, stat); + + if ( (*info = spivotL(icol, diag_pivot_thresh, &usepr, perm_r, + iperm_r, iperm_c, &pivrow, &Glu, stat)) ) + if ( iinfo == 0 ) iinfo = *info; + +#ifdef DEBUG + sprint_lu_col("[1]: ", icol, pivrow, xprune, &Glu); +#endif + + } + + jcol = icol; + + } else { /* Work on one panel of panel_size columns */ + + /* Adjust panel_size so that a panel won't overlap with the next + * relaxed snode. + */ + panel_size = w_def; + for (k = jcol + 1; k < SUPERLU_MIN(jcol+panel_size, min_mn); k++) + if ( relax_end[k] != EMPTY ) { + panel_size = k - jcol; + break; + } + if ( k == min_mn ) panel_size = min_mn - jcol; + panel_histo[panel_size]++; + + /* symbolic factor on a panel of columns */ + spanel_dfs(m, panel_size, jcol, A, perm_r, &nseg1, + dense, panel_lsub, segrep, repfnz, xprune, + marker, parent, xplore, &Glu); + + /* numeric sup-panel updates in topological order */ + spanel_bmod(m, panel_size, jcol, nseg1, dense, + tempv, segrep, repfnz, &Glu, stat); + + /* Sparse LU within the panel, and below panel diagonal */ + for ( jj = jcol; jj < jcol + panel_size; jj++) { + k = (jj - jcol) * m; /* column index for w-wide arrays */ + + nseg = nseg1; /* Begin after all the panel segments */ + + if ((*info = scolumn_dfs(m, jj, perm_r, &nseg, &panel_lsub[k], + segrep, &repfnz[k], xprune, marker, + parent, xplore, &Glu)) != 0) return; + + /* Numeric updates */ + if ((*info = scolumn_bmod(jj, (nseg - nseg1), &dense[k], + tempv, &segrep[nseg1], &repfnz[k], + jcol, &Glu, stat)) != 0) return; + + /* Copy the U-segments to ucol[*] */ + if ((*info = scopy_to_ucol(jj, nseg, segrep, &repfnz[k], + perm_r, &dense[k], &Glu)) != 0) + return; + + if ( (*info = spivotL(jj, diag_pivot_thresh, &usepr, perm_r, + iperm_r, iperm_c, &pivrow, &Glu, stat)) ) + if ( iinfo == 0 ) iinfo = *info; + + /* Prune columns (0:jj-1) using column jj */ + spruneL(jj, perm_r, pivrow, nseg, segrep, + &repfnz[k], xprune, &Glu); + + /* Reset repfnz[] for this column */ + resetrep_col (nseg, segrep, &repfnz[k]); + +#ifdef DEBUG + sprint_lu_col("[2]: ", jj, pivrow, xprune, &Glu); +#endif + + } + + jcol += panel_size; /* Move to the next panel */ + + } /* else */ + + } /* for */ + + *info = iinfo; + + if ( m > n ) { + k = 0; + for (i = 0; i < m; ++i) + if ( perm_r[i] == EMPTY ) { + perm_r[i] = n + k; + ++k; + } + } + + countnz(min_mn, xprune, &nnzL, &nnzU, &Glu); + fixupL(min_mn, perm_r, &Glu); + + sLUWorkFree(iwork, swork, &Glu); /* Free work space and compress storage */ + + if ( fact == SamePattern_SameRowPerm ) { + /* L and U structures may have changed due to possibly different + pivoting, even though the storage is available. + There could also be memory expansions, so the array locations + may have changed, */ + ((SCformat *)L->Store)->nnz = nnzL; + ((SCformat *)L->Store)->nsuper = Glu.supno[n]; + ((SCformat *)L->Store)->nzval = Glu.lusup; + ((SCformat *)L->Store)->nzval_colptr = Glu.xlusup; + ((SCformat *)L->Store)->rowind = Glu.lsub; + ((SCformat *)L->Store)->rowind_colptr = Glu.xlsub; + ((NCformat *)U->Store)->nnz = nnzU; + ((NCformat *)U->Store)->nzval = Glu.ucol; + ((NCformat *)U->Store)->rowind = Glu.usub; + ((NCformat *)U->Store)->colptr = Glu.xusub; + } else { + sCreate_SuperNode_Matrix(L, A->nrow, A->ncol, nnzL, Glu.lusup, + Glu.xlusup, Glu.lsub, Glu.xlsub, Glu.supno, + Glu.xsup, SLU_SC, SLU_S, SLU_TRLU); + sCreate_CompCol_Matrix(U, min_mn, min_mn, nnzU, Glu.ucol, + Glu.usub, Glu.xusub, SLU_NC, SLU_S, SLU_TRU); + } + + ops[FACT] += ops[TRSV] + ops[GEMV]; + + if ( iperm_r_allocated ) SUPERLU_FREE (iperm_r); + SUPERLU_FREE (iperm_c); + SUPERLU_FREE (relax_end); + +} diff --git a/intern/opennl/superlu/sgstrs.c b/intern/opennl/superlu/sgstrs.c new file mode 100644 index 00000000000..5f7b9b57195 --- /dev/null +++ b/intern/opennl/superlu/sgstrs.c @@ -0,0 +1,331 @@ + +/* + * -- SuperLU routine (version 3.0) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * October 15, 2003 + * + */ +/* + Copyright (c) 1994 by Xerox Corporation. All rights reserved. + + THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY + EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. + + Permission is hereby granted to use or copy this program for any + purpose, provided the above notices are retained on all copies. + Permission to modify the code and to distribute modified code is + granted, provided the above notices are retained, and a notice that + the code was modified is included with the above copyright notice. +*/ + +#include "ssp_defs.h" + + +/* + * Function prototypes + */ +void susolve(int, int, float*, float*); +void slsolve(int, int, float*, float*); +void smatvec(int, int, int, float*, float*, float*); + + +void +sgstrs (trans_t trans, SuperMatrix *L, SuperMatrix *U, + int *perm_c, int *perm_r, SuperMatrix *B, + SuperLUStat_t *stat, int *info) +{ +/* + * Purpose + * ======= + * + * SGSTRS solves a system of linear equations A*X=B or A'*X=B + * with A sparse and B dense, using the LU factorization computed by + * SGSTRF. + * + * See supermatrix.h for the definition of 'SuperMatrix' structure. + * + * Arguments + * ========= + * + * trans (input) trans_t + * Specifies the form of the system of equations: + * = NOTRANS: A * X = B (No transpose) + * = TRANS: A'* X = B (Transpose) + * = CONJ: A**H * X = B (Conjugate transpose) + * + * L (input) SuperMatrix* + * The factor L from the factorization Pr*A*Pc=L*U as computed by + * sgstrf(). Use compressed row subscripts storage for supernodes, + * i.e., L has types: Stype = SLU_SC, Dtype = SLU_S, Mtype = SLU_TRLU. + * + * U (input) SuperMatrix* + * The factor U from the factorization Pr*A*Pc=L*U as computed by + * sgstrf(). Use column-wise storage scheme, i.e., U has types: + * Stype = SLU_NC, Dtype = SLU_S, Mtype = SLU_TRU. + * + * perm_c (input) int*, dimension (L->ncol) + * Column permutation vector, which defines the + * permutation matrix Pc; perm_c[i] = j means column i of A is + * in position j in A*Pc. + * + * perm_r (input) int*, dimension (L->nrow) + * Row permutation vector, which defines the permutation matrix Pr; + * perm_r[i] = j means row i of A is in position j in Pr*A. + * + * B (input/output) SuperMatrix* + * B has types: Stype = SLU_DN, Dtype = SLU_S, Mtype = SLU_GE. + * On entry, the right hand side matrix. + * On exit, the solution matrix if info = 0; + * + * stat (output) SuperLUStat_t* + * Record the statistics on runtime and floating-point operation count. + * See util.h for the definition of 'SuperLUStat_t'. + * + * info (output) int* + * = 0: successful exit + * < 0: if info = -i, the i-th argument had an illegal value + * + */ +#ifdef _CRAY + _fcd ftcs1, ftcs2, ftcs3, ftcs4; +#endif +#ifdef USE_VENDOR_BLAS + float alpha = 1.0, beta = 1.0; + float *work_col; +#endif + DNformat *Bstore; + float *Bmat; + SCformat *Lstore; + NCformat *Ustore; + float *Lval, *Uval; + int fsupc, nrow, nsupr, nsupc, luptr, istart, irow; + int i, j, k, iptr, jcol, n, ldb, nrhs; + float *work, *rhs_work, *soln; + flops_t solve_ops; + void sprint_soln(); + + /* Test input parameters ... */ + *info = 0; + Bstore = B->Store; + ldb = Bstore->lda; + nrhs = B->ncol; + if ( trans != NOTRANS && trans != TRANS && trans != CONJ ) *info = -1; + else if ( L->nrow != L->ncol || L->nrow < 0 || + L->Stype != SLU_SC || L->Dtype != SLU_S || L->Mtype != SLU_TRLU ) + *info = -2; + else if ( U->nrow != U->ncol || U->nrow < 0 || + U->Stype != SLU_NC || U->Dtype != SLU_S || U->Mtype != SLU_TRU ) + *info = -3; + else if ( ldb < SUPERLU_MAX(0, L->nrow) || + B->Stype != SLU_DN || B->Dtype != SLU_S || B->Mtype != SLU_GE ) + *info = -6; + if ( *info ) { + i = -(*info); + xerbla_("sgstrs", &i); + return; + } + + n = L->nrow; + work = floatCalloc(n * nrhs); + if ( !work ) ABORT("Malloc fails for local work[]."); + soln = floatMalloc(n); + if ( !soln ) ABORT("Malloc fails for local soln[]."); + + Bmat = Bstore->nzval; + Lstore = L->Store; + Lval = Lstore->nzval; + Ustore = U->Store; + Uval = Ustore->nzval; + solve_ops = 0; + + if ( trans == NOTRANS ) { + /* Permute right hand sides to form Pr*B */ + for (i = 0; i < nrhs; i++) { + rhs_work = &Bmat[i*ldb]; + for (k = 0; k < n; k++) soln[perm_r[k]] = rhs_work[k]; + for (k = 0; k < n; k++) rhs_work[k] = soln[k]; + } + + /* Forward solve PLy=Pb. */ + for (k = 0; k <= Lstore->nsuper; k++) { + fsupc = L_FST_SUPC(k); + istart = L_SUB_START(fsupc); + nsupr = L_SUB_START(fsupc+1) - istart; + nsupc = L_FST_SUPC(k+1) - fsupc; + nrow = nsupr - nsupc; + + solve_ops += nsupc * (nsupc - 1) * nrhs; + solve_ops += 2 * nrow * nsupc * nrhs; + + if ( nsupc == 1 ) { + for (j = 0; j < nrhs; j++) { + rhs_work = &Bmat[j*ldb]; + luptr = L_NZ_START(fsupc); + for (iptr=istart+1; iptr < L_SUB_START(fsupc+1); iptr++){ + irow = L_SUB(iptr); + ++luptr; + rhs_work[irow] -= rhs_work[fsupc] * Lval[luptr]; + } + } + } else { + luptr = L_NZ_START(fsupc); +#ifdef USE_VENDOR_BLAS +#ifdef _CRAY + ftcs1 = _cptofcd("L", strlen("L")); + ftcs2 = _cptofcd("N", strlen("N")); + ftcs3 = _cptofcd("U", strlen("U")); + STRSM( ftcs1, ftcs1, ftcs2, ftcs3, &nsupc, &nrhs, &alpha, + &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb); + + SGEMM( ftcs2, ftcs2, &nrow, &nrhs, &nsupc, &alpha, + &Lval[luptr+nsupc], &nsupr, &Bmat[fsupc], &ldb, + &beta, &work[0], &n ); +#else + strsm_("L", "L", "N", "U", &nsupc, &nrhs, &alpha, + &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb); + + sgemm_( "N", "N", &nrow, &nrhs, &nsupc, &alpha, + &Lval[luptr+nsupc], &nsupr, &Bmat[fsupc], &ldb, + &beta, &work[0], &n ); +#endif + for (j = 0; j < nrhs; j++) { + rhs_work = &Bmat[j*ldb]; + work_col = &work[j*n]; + iptr = istart + nsupc; + for (i = 0; i < nrow; i++) { + irow = L_SUB(iptr); + rhs_work[irow] -= work_col[i]; /* Scatter */ + work_col[i] = 0.0; + iptr++; + } + } +#else + for (j = 0; j < nrhs; j++) { + rhs_work = &Bmat[j*ldb]; + slsolve (nsupr, nsupc, &Lval[luptr], &rhs_work[fsupc]); + smatvec (nsupr, nrow, nsupc, &Lval[luptr+nsupc], + &rhs_work[fsupc], &work[0] ); + + iptr = istart + nsupc; + for (i = 0; i < nrow; i++) { + irow = L_SUB(iptr); + rhs_work[irow] -= work[i]; + work[i] = 0.0; + iptr++; + } + } +#endif + } /* else ... */ + } /* for L-solve */ + +#ifdef DEBUG + printf("After L-solve: y=\n"); + sprint_soln(n, Bmat); +#endif + + /* + * Back solve Ux=y. + */ + for (k = Lstore->nsuper; k >= 0; k--) { + fsupc = L_FST_SUPC(k); + istart = L_SUB_START(fsupc); + nsupr = L_SUB_START(fsupc+1) - istart; + nsupc = L_FST_SUPC(k+1) - fsupc; + luptr = L_NZ_START(fsupc); + + solve_ops += nsupc * (nsupc + 1) * nrhs; + + if ( nsupc == 1 ) { + rhs_work = &Bmat[0]; + for (j = 0; j < nrhs; j++) { + rhs_work[fsupc] /= Lval[luptr]; + rhs_work += ldb; + } + } else { +#ifdef USE_VENDOR_BLAS +#ifdef _CRAY + ftcs1 = _cptofcd("L", strlen("L")); + ftcs2 = _cptofcd("U", strlen("U")); + ftcs3 = _cptofcd("N", strlen("N")); + STRSM( ftcs1, ftcs2, ftcs3, ftcs3, &nsupc, &nrhs, &alpha, + &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb); +#else + strsm_("L", "U", "N", "N", &nsupc, &nrhs, &alpha, + &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb); +#endif +#else + for (j = 0; j < nrhs; j++) + susolve ( nsupr, nsupc, &Lval[luptr], &Bmat[fsupc+j*ldb] ); +#endif + } + + for (j = 0; j < nrhs; ++j) { + rhs_work = &Bmat[j*ldb]; + for (jcol = fsupc; jcol < fsupc + nsupc; jcol++) { + solve_ops += 2*(U_NZ_START(jcol+1) - U_NZ_START(jcol)); + for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); i++ ){ + irow = U_SUB(i); + rhs_work[irow] -= rhs_work[jcol] * Uval[i]; + } + } + } + + } /* for U-solve */ + +#ifdef DEBUG + printf("After U-solve: x=\n"); + sprint_soln(n, Bmat); +#endif + + /* Compute the final solution X := Pc*X. */ + for (i = 0; i < nrhs; i++) { + rhs_work = &Bmat[i*ldb]; + for (k = 0; k < n; k++) soln[k] = rhs_work[perm_c[k]]; + for (k = 0; k < n; k++) rhs_work[k] = soln[k]; + } + + stat->ops[SOLVE] = solve_ops; + + } else { /* Solve A'*X=B or CONJ(A)*X=B */ + /* Permute right hand sides to form Pc'*B. */ + for (i = 0; i < nrhs; i++) { + rhs_work = &Bmat[i*ldb]; + for (k = 0; k < n; k++) soln[perm_c[k]] = rhs_work[k]; + for (k = 0; k < n; k++) rhs_work[k] = soln[k]; + } + + stat->ops[SOLVE] = 0; + for (k = 0; k < nrhs; ++k) { + + /* Multiply by inv(U'). */ + sp_strsv("U", "T", "N", L, U, &Bmat[k*ldb], stat, info); + + /* Multiply by inv(L'). */ + sp_strsv("L", "T", "U", L, U, &Bmat[k*ldb], stat, info); + + } + /* Compute the final solution X := Pr'*X (=inv(Pr)*X) */ + for (i = 0; i < nrhs; i++) { + rhs_work = &Bmat[i*ldb]; + for (k = 0; k < n; k++) soln[k] = rhs_work[perm_r[k]]; + for (k = 0; k < n; k++) rhs_work[k] = soln[k]; + } + + } + + SUPERLU_FREE(work); + SUPERLU_FREE(soln); +} + +/* + * Diagnostic print of the solution vector + */ +void +sprint_soln(int n, float *soln) +{ + int i; + + for (i = 0; i < n; i++) + printf("\t%d: %.4f\n", i, soln[i]); +} diff --git a/intern/opennl/superlu/smemory.c b/intern/opennl/superlu/smemory.c new file mode 100644 index 00000000000..79da748671a --- /dev/null +++ b/intern/opennl/superlu/smemory.c @@ -0,0 +1,676 @@ + +/* + * -- SuperLU routine (version 3.0) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * October 15, 2003 + * + */ +#include "ssp_defs.h" + +/* Constants */ +#define NO_MEMTYPE 4 /* 0: lusup; + 1: ucol; + 2: lsub; + 3: usub */ +#define GluIntArray(n) (5 * (n) + 5) + +/* Internal prototypes */ +void *sexpand (int *, MemType,int, int, GlobalLU_t *); +int sLUWorkInit (int, int, int, int **, float **, LU_space_t); +void copy_mem_float (int, void *, void *); +void sStackCompress (GlobalLU_t *); +void sSetupSpace (void *, int, LU_space_t *); +void *suser_malloc (int, int); +void suser_free (int, int); + +/* External prototypes (in memory.c - prec-indep) */ +extern void copy_mem_int (int, void *, void *); +extern void user_bcopy (char *, char *, int); + +/* Headers for 4 types of dynamatically managed memory */ +typedef struct e_node { + int size; /* length of the memory that has been used */ + void *mem; /* pointer to the new malloc'd store */ +} ExpHeader; + +typedef struct { + int size; + int used; + int top1; /* grow upward, relative to &array[0] */ + int top2; /* grow downward */ + void *array; +} LU_stack_t; + +/* Variables local to this file */ +static ExpHeader *expanders = 0; /* Array of pointers to 4 types of memory */ +static LU_stack_t stack; +static int no_expand; + +/* Macros to manipulate stack */ +#define StackFull(x) ( x + stack.used >= stack.size ) +#define NotDoubleAlign(addr) ( (long int)addr & 7 ) +#define DoubleAlign(addr) ( ((long int)addr + 7) & ~7L ) +#define TempSpace(m, w) ( (2*w + 4 + NO_MARKER) * m * sizeof(int) + \ + (w + 1) * m * sizeof(float) ) +#define Reduce(alpha) ((alpha + 1) / 2) /* i.e. (alpha-1)/2 + 1 */ + + + + +/* + * Setup the memory model to be used for factorization. + * lwork = 0: use system malloc; + * lwork > 0: use user-supplied work[] space. + */ +void sSetupSpace(void *work, int lwork, LU_space_t *MemModel) +{ + if ( lwork == 0 ) { + *MemModel = SYSTEM; /* malloc/free */ + } else if ( lwork > 0 ) { + *MemModel = USER; /* user provided space */ + stack.used = 0; + stack.top1 = 0; + stack.top2 = (lwork/4)*4; /* must be word addressable */ + stack.size = stack.top2; + stack.array = (void *) work; + } +} + + + +void *suser_malloc(int bytes, int which_end) +{ + void *buf; + + if ( StackFull(bytes) ) return (NULL); + + if ( which_end == HEAD ) { + buf = (char*) stack.array + stack.top1; + stack.top1 += bytes; + } else { + stack.top2 -= bytes; + buf = (char*) stack.array + stack.top2; + } + + stack.used += bytes; + return buf; +} + + +void suser_free(int bytes, int which_end) +{ + if ( which_end == HEAD ) { + stack.top1 -= bytes; + } else { + stack.top2 += bytes; + } + stack.used -= bytes; +} + + + +/* + * mem_usage consists of the following fields: + * - for_lu (float) + * The amount of space used in bytes for the L\U data structures. + * - total_needed (float) + * The amount of space needed in bytes to perform factorization. + * - expansions (int) + * Number of memory expansions during the LU factorization. + */ +int sQuerySpace(SuperMatrix *L, SuperMatrix *U, mem_usage_t *mem_usage) +{ + SCformat *Lstore; + NCformat *Ustore; + register int n, iword, dword, panel_size = sp_ienv(1); + + Lstore = L->Store; + Ustore = U->Store; + n = L->ncol; + iword = sizeof(int); + dword = sizeof(float); + + /* For LU factors */ + mem_usage->for_lu = (float)( (4*n + 3) * iword + Lstore->nzval_colptr[n] * + dword + Lstore->rowind_colptr[n] * iword ); + mem_usage->for_lu += (float)( (n + 1) * iword + + Ustore->colptr[n] * (dword + iword) ); + + /* Working storage to support factorization */ + mem_usage->total_needed = mem_usage->for_lu + + (float)( (2 * panel_size + 4 + NO_MARKER) * n * iword + + (panel_size + 1) * n * dword ); + + mem_usage->expansions = --no_expand; + + return 0; +} /* sQuerySpace */ + +/* + * Allocate storage for the data structures common to all factor routines. + * For those unpredictable size, make a guess as FILL * nnz(A). + * Return value: + * If lwork = -1, return the estimated amount of space required, plus n; + * otherwise, return the amount of space actually allocated when + * memory allocation failure occurred. + */ +int +sLUMemInit(fact_t fact, void *work, int lwork, int m, int n, int annz, + int panel_size, SuperMatrix *L, SuperMatrix *U, GlobalLU_t *Glu, + int **iwork, float **dwork) +{ + int info, iword, dword; + SCformat *Lstore; + NCformat *Ustore; + int *xsup, *supno; + int *lsub, *xlsub; + float *lusup; + int *xlusup; + float *ucol; + int *usub, *xusub; + int nzlmax, nzumax, nzlumax; + int FILL = sp_ienv(6); + + Glu->n = n; + no_expand = 0; + iword = sizeof(int); + dword = sizeof(float); + + if ( !expanders ) + expanders = (ExpHeader*)SUPERLU_MALLOC(NO_MEMTYPE * sizeof(ExpHeader)); + if ( !expanders ) ABORT("SUPERLU_MALLOC fails for expanders"); + + if ( fact != SamePattern_SameRowPerm ) { + /* Guess for L\U factors */ + nzumax = nzlumax = FILL * annz; + nzlmax = SUPERLU_MAX(1, FILL/4.) * annz; + + if ( lwork == -1 ) { + return ( GluIntArray(n) * iword + TempSpace(m, panel_size) + + (nzlmax+nzumax)*iword + (nzlumax+nzumax)*dword + n ); + } else { + sSetupSpace(work, lwork, &Glu->MemModel); + } + +#ifdef DEBUG + printf("sLUMemInit() called: annz %d, MemModel %d\n", + annz, Glu->MemModel); +#endif + + /* Integer pointers for L\U factors */ + if ( Glu->MemModel == SYSTEM ) { + xsup = intMalloc(n+1); + supno = intMalloc(n+1); + xlsub = intMalloc(n+1); + xlusup = intMalloc(n+1); + xusub = intMalloc(n+1); + } else { + xsup = (int *)suser_malloc((n+1) * iword, HEAD); + supno = (int *)suser_malloc((n+1) * iword, HEAD); + xlsub = (int *)suser_malloc((n+1) * iword, HEAD); + xlusup = (int *)suser_malloc((n+1) * iword, HEAD); + xusub = (int *)suser_malloc((n+1) * iword, HEAD); + } + + lusup = (float *) sexpand( &nzlumax, LUSUP, 0, 0, Glu ); + ucol = (float *) sexpand( &nzumax, UCOL, 0, 0, Glu ); + lsub = (int *) sexpand( &nzlmax, LSUB, 0, 0, Glu ); + usub = (int *) sexpand( &nzumax, USUB, 0, 1, Glu ); + + while ( !lusup || !ucol || !lsub || !usub ) { + if ( Glu->MemModel == SYSTEM ) { + SUPERLU_FREE(lusup); + SUPERLU_FREE(ucol); + SUPERLU_FREE(lsub); + SUPERLU_FREE(usub); + } else { + suser_free((nzlumax+nzumax)*dword+(nzlmax+nzumax)*iword, HEAD); + } + nzlumax /= 2; + nzumax /= 2; + nzlmax /= 2; + if ( nzlumax < annz ) { + printf("Not enough memory to perform factorization.\n"); + return (smemory_usage(nzlmax, nzumax, nzlumax, n) + n); + } + lusup = (float *) sexpand( &nzlumax, LUSUP, 0, 0, Glu ); + ucol = (float *) sexpand( &nzumax, UCOL, 0, 0, Glu ); + lsub = (int *) sexpand( &nzlmax, LSUB, 0, 0, Glu ); + usub = (int *) sexpand( &nzumax, USUB, 0, 1, Glu ); + } + + } else { + /* fact == SamePattern_SameRowPerm */ + Lstore = L->Store; + Ustore = U->Store; + xsup = Lstore->sup_to_col; + supno = Lstore->col_to_sup; + xlsub = Lstore->rowind_colptr; + xlusup = Lstore->nzval_colptr; + xusub = Ustore->colptr; + nzlmax = Glu->nzlmax; /* max from previous factorization */ + nzumax = Glu->nzumax; + nzlumax = Glu->nzlumax; + + if ( lwork == -1 ) { + return ( GluIntArray(n) * iword + TempSpace(m, panel_size) + + (nzlmax+nzumax)*iword + (nzlumax+nzumax)*dword + n ); + } else if ( lwork == 0 ) { + Glu->MemModel = SYSTEM; + } else { + Glu->MemModel = USER; + stack.top2 = (lwork/4)*4; /* must be word-addressable */ + stack.size = stack.top2; + } + + lsub = expanders[LSUB].mem = Lstore->rowind; + lusup = expanders[LUSUP].mem = Lstore->nzval; + usub = expanders[USUB].mem = Ustore->rowind; + ucol = expanders[UCOL].mem = Ustore->nzval;; + expanders[LSUB].size = nzlmax; + expanders[LUSUP].size = nzlumax; + expanders[USUB].size = nzumax; + expanders[UCOL].size = nzumax; + } + + Glu->xsup = xsup; + Glu->supno = supno; + Glu->lsub = lsub; + Glu->xlsub = xlsub; + Glu->lusup = lusup; + Glu->xlusup = xlusup; + Glu->ucol = ucol; + Glu->usub = usub; + Glu->xusub = xusub; + Glu->nzlmax = nzlmax; + Glu->nzumax = nzumax; + Glu->nzlumax = nzlumax; + + info = sLUWorkInit(m, n, panel_size, iwork, dwork, Glu->MemModel); + if ( info ) + return ( info + smemory_usage(nzlmax, nzumax, nzlumax, n) + n); + + ++no_expand; + return 0; + +} /* sLUMemInit */ + +/* Allocate known working storage. Returns 0 if success, otherwise + returns the number of bytes allocated so far when failure occurred. */ +int +sLUWorkInit(int m, int n, int panel_size, int **iworkptr, + float **dworkptr, LU_space_t MemModel) +{ + int isize, dsize, extra; + float *old_ptr; + int maxsuper = sp_ienv(3), + rowblk = sp_ienv(4); + + isize = ( (2 * panel_size + 3 + NO_MARKER ) * m + n ) * sizeof(int); + dsize = (m * panel_size + + NUM_TEMPV(m,panel_size,maxsuper,rowblk)) * sizeof(float); + + if ( MemModel == SYSTEM ) + *iworkptr = (int *) intCalloc(isize/sizeof(int)); + else + *iworkptr = (int *) suser_malloc(isize, TAIL); + if ( ! *iworkptr ) { + fprintf(stderr, "sLUWorkInit: malloc fails for local iworkptr[]\n"); + return (isize + n); + } + + if ( MemModel == SYSTEM ) + *dworkptr = (float *) SUPERLU_MALLOC(dsize); + else { + *dworkptr = (float *) suser_malloc(dsize, TAIL); + if ( NotDoubleAlign(*dworkptr) ) { + old_ptr = *dworkptr; + *dworkptr = (float*) DoubleAlign(*dworkptr); + *dworkptr = (float*) ((double*)*dworkptr - 1); + extra = (char*)old_ptr - (char*)*dworkptr; +#ifdef DEBUG + printf("sLUWorkInit: not aligned, extra %d\n", extra); +#endif + stack.top2 -= extra; + stack.used += extra; + } + } + if ( ! *dworkptr ) { + fprintf(stderr, "malloc fails for local dworkptr[]."); + return (isize + dsize + n); + } + + return 0; +} + + +/* + * Set up pointers for real working arrays. + */ +void +sSetRWork(int m, int panel_size, float *dworkptr, + float **dense, float **tempv) +{ + float zero = 0.0; + + int maxsuper = sp_ienv(3), + rowblk = sp_ienv(4); + *dense = dworkptr; + *tempv = *dense + panel_size*m; + sfill (*dense, m * panel_size, zero); + sfill (*tempv, NUM_TEMPV(m,panel_size,maxsuper,rowblk), zero); +} + +/* + * Free the working storage used by factor routines. + */ +void sLUWorkFree(int *iwork, float *dwork, GlobalLU_t *Glu) +{ + if ( Glu->MemModel == SYSTEM ) { + SUPERLU_FREE (iwork); + SUPERLU_FREE (dwork); + } else { + stack.used -= (stack.size - stack.top2); + stack.top2 = stack.size; +/* sStackCompress(Glu); */ + } + + SUPERLU_FREE (expanders); + expanders = 0; +} + +/* Expand the data structures for L and U during the factorization. + * Return value: 0 - successful return + * > 0 - number of bytes allocated when run out of space + */ +int +sLUMemXpand(int jcol, + int next, /* number of elements currently in the factors */ + MemType mem_type, /* which type of memory to expand */ + int *maxlen, /* modified - maximum length of a data structure */ + GlobalLU_t *Glu /* modified - global LU data structures */ + ) +{ + void *new_mem; + +#ifdef DEBUG + printf("sLUMemXpand(): jcol %d, next %d, maxlen %d, MemType %d\n", + jcol, next, *maxlen, mem_type); +#endif + + if (mem_type == USUB) + new_mem = sexpand(maxlen, mem_type, next, 1, Glu); + else + new_mem = sexpand(maxlen, mem_type, next, 0, Glu); + + if ( !new_mem ) { + int nzlmax = Glu->nzlmax; + int nzumax = Glu->nzumax; + int nzlumax = Glu->nzlumax; + fprintf(stderr, "Can't expand MemType %d: jcol %d\n", mem_type, jcol); + return (smemory_usage(nzlmax, nzumax, nzlumax, Glu->n) + Glu->n); + } + + switch ( mem_type ) { + case LUSUP: + Glu->lusup = (float *) new_mem; + Glu->nzlumax = *maxlen; + break; + case UCOL: + Glu->ucol = (float *) new_mem; + Glu->nzumax = *maxlen; + break; + case LSUB: + Glu->lsub = (int *) new_mem; + Glu->nzlmax = *maxlen; + break; + case USUB: + Glu->usub = (int *) new_mem; + Glu->nzumax = *maxlen; + break; + } + + return 0; + +} + + + +void +copy_mem_float(int howmany, void *old, void *new) +{ + register int i; + float *dold = old; + float *dnew = new; + for (i = 0; i < howmany; i++) dnew[i] = dold[i]; +} + +/* + * Expand the existing storage to accommodate more fill-ins. + */ +void +*sexpand ( + int *prev_len, /* length used from previous call */ + MemType type, /* which part of the memory to expand */ + int len_to_copy, /* size of the memory to be copied to new store */ + int keep_prev, /* = 1: use prev_len; + = 0: compute new_len to expand */ + GlobalLU_t *Glu /* modified - global LU data structures */ + ) +{ + float EXPAND = 1.5; + float alpha; + void *new_mem, *old_mem; + int new_len, tries, lword, extra, bytes_to_copy; + + alpha = EXPAND; + + if ( no_expand == 0 || keep_prev ) /* First time allocate requested */ + new_len = *prev_len; + else { + new_len = alpha * *prev_len; + } + + if ( type == LSUB || type == USUB ) lword = sizeof(int); + else lword = sizeof(float); + + if ( Glu->MemModel == SYSTEM ) { + new_mem = (void *) SUPERLU_MALLOC(new_len * lword); +/* new_mem = (void *) calloc(new_len, lword); */ + if ( no_expand != 0 ) { + tries = 0; + if ( keep_prev ) { + if ( !new_mem ) return (NULL); + } else { + while ( !new_mem ) { + if ( ++tries > 10 ) return (NULL); + alpha = Reduce(alpha); + new_len = alpha * *prev_len; + new_mem = (void *) SUPERLU_MALLOC(new_len * lword); +/* new_mem = (void *) calloc(new_len, lword); */ + } + } + if ( type == LSUB || type == USUB ) { + copy_mem_int(len_to_copy, expanders[type].mem, new_mem); + } else { + copy_mem_float(len_to_copy, expanders[type].mem, new_mem); + } + SUPERLU_FREE (expanders[type].mem); + } + expanders[type].mem = (void *) new_mem; + + } else { /* MemModel == USER */ + if ( no_expand == 0 ) { + new_mem = suser_malloc(new_len * lword, HEAD); + if ( NotDoubleAlign(new_mem) && + (type == LUSUP || type == UCOL) ) { + old_mem = new_mem; + new_mem = (void *)DoubleAlign(new_mem); + extra = (char*)new_mem - (char*)old_mem; +#ifdef DEBUG + printf("expand(): not aligned, extra %d\n", extra); +#endif + stack.top1 += extra; + stack.used += extra; + } + expanders[type].mem = (void *) new_mem; + } + else { + tries = 0; + extra = (new_len - *prev_len) * lword; + if ( keep_prev ) { + if ( StackFull(extra) ) return (NULL); + } else { + while ( StackFull(extra) ) { + if ( ++tries > 10 ) return (NULL); + alpha = Reduce(alpha); + new_len = alpha * *prev_len; + extra = (new_len - *prev_len) * lword; + } + } + + if ( type != USUB ) { + new_mem = (void*)((char*)expanders[type + 1].mem + extra); + bytes_to_copy = (char*)stack.array + stack.top1 + - (char*)expanders[type + 1].mem; + user_bcopy(expanders[type+1].mem, new_mem, bytes_to_copy); + + if ( type < USUB ) { + Glu->usub = expanders[USUB].mem = + (void*)((char*)expanders[USUB].mem + extra); + } + if ( type < LSUB ) { + Glu->lsub = expanders[LSUB].mem = + (void*)((char*)expanders[LSUB].mem + extra); + } + if ( type < UCOL ) { + Glu->ucol = expanders[UCOL].mem = + (void*)((char*)expanders[UCOL].mem + extra); + } + stack.top1 += extra; + stack.used += extra; + if ( type == UCOL ) { + stack.top1 += extra; /* Add same amount for USUB */ + stack.used += extra; + } + + } /* if ... */ + + } /* else ... */ + } + + expanders[type].size = new_len; + *prev_len = new_len; + if ( no_expand ) ++no_expand; + + return (void *) expanders[type].mem; + +} /* sexpand */ + + +/* + * Compress the work[] array to remove fragmentation. + */ +void +sStackCompress(GlobalLU_t *Glu) +{ + register int iword, dword, ndim; + char *last, *fragment; + int *ifrom, *ito; + float *dfrom, *dto; + int *xlsub, *lsub, *xusub, *usub, *xlusup; + float *ucol, *lusup; + + iword = sizeof(int); + dword = sizeof(float); + ndim = Glu->n; + + xlsub = Glu->xlsub; + lsub = Glu->lsub; + xusub = Glu->xusub; + usub = Glu->usub; + xlusup = Glu->xlusup; + ucol = Glu->ucol; + lusup = Glu->lusup; + + dfrom = ucol; + dto = (float *)((char*)lusup + xlusup[ndim] * dword); + copy_mem_float(xusub[ndim], dfrom, dto); + ucol = dto; + + ifrom = lsub; + ito = (int *) ((char*)ucol + xusub[ndim] * iword); + copy_mem_int(xlsub[ndim], ifrom, ito); + lsub = ito; + + ifrom = usub; + ito = (int *) ((char*)lsub + xlsub[ndim] * iword); + copy_mem_int(xusub[ndim], ifrom, ito); + usub = ito; + + last = (char*)usub + xusub[ndim] * iword; + fragment = (char*) (((char*)stack.array + stack.top1) - last); + stack.used -= (long int) fragment; + stack.top1 -= (long int) fragment; + + Glu->ucol = ucol; + Glu->lsub = lsub; + Glu->usub = usub; + +#ifdef DEBUG + printf("sStackCompress: fragment %d\n", fragment); + /* for (last = 0; last < ndim; ++last) + print_lu_col("After compress:", last, 0);*/ +#endif + +} + +/* + * Allocate storage for original matrix A + */ +void +sallocateA(int n, int nnz, float **a, int **asub, int **xa) +{ + *a = (float *) floatMalloc(nnz); + *asub = (int *) intMalloc(nnz); + *xa = (int *) intMalloc(n+1); +} + + +float *floatMalloc(int n) +{ + float *buf; + buf = (float *) SUPERLU_MALLOC(n * sizeof(float)); + if ( !buf ) { + ABORT("SUPERLU_MALLOC failed for buf in floatMalloc()\n"); + } + return (buf); +} + +float *floatCalloc(int n) +{ + float *buf; + register int i; + float zero = 0.0; + buf = (float *) SUPERLU_MALLOC(n * sizeof(float)); + if ( !buf ) { + ABORT("SUPERLU_MALLOC failed for buf in floatCalloc()\n"); + } + for (i = 0; i < n; ++i) buf[i] = zero; + return (buf); +} + + +int smemory_usage(const int nzlmax, const int nzumax, + const int nzlumax, const int n) +{ + register int iword, dword; + + iword = sizeof(int); + dword = sizeof(float); + + return (10 * n * iword + + nzlmax * iword + nzumax * (iword + dword) + nzlumax * dword); + +} diff --git a/intern/opennl/superlu/smyblas2.c b/intern/opennl/superlu/smyblas2.c new file mode 100644 index 00000000000..729e17f7674 --- /dev/null +++ b/intern/opennl/superlu/smyblas2.c @@ -0,0 +1,225 @@ + + +/* + * -- SuperLU routine (version 2.0) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * November 15, 1997 + * + */ +/* + * File name: smyblas2.c + * Purpose: + * Level 2 BLAS operations: solves and matvec, written in C. + * Note: + * This is only used when the system lacks an efficient BLAS library. + */ + +/* + * Solves a dense UNIT lower triangular system. The unit lower + * triangular matrix is stored in a 2D array M(1:nrow,1:ncol). + * The solution will be returned in the rhs vector. + */ +void slsolve ( int ldm, int ncol, float *M, float *rhs ) +{ + int k; + float x0, x1, x2, x3, x4, x5, x6, x7; + float *M0; + register float *Mki0, *Mki1, *Mki2, *Mki3, *Mki4, *Mki5, *Mki6, *Mki7; + register int firstcol = 0; + + M0 = &M[0]; + + while ( firstcol < ncol - 7 ) { /* Do 8 columns */ + Mki0 = M0 + 1; + Mki1 = Mki0 + ldm + 1; + Mki2 = Mki1 + ldm + 1; + Mki3 = Mki2 + ldm + 1; + Mki4 = Mki3 + ldm + 1; + Mki5 = Mki4 + ldm + 1; + Mki6 = Mki5 + ldm + 1; + Mki7 = Mki6 + ldm + 1; + + x0 = rhs[firstcol]; + x1 = rhs[firstcol+1] - x0 * *Mki0++; + x2 = rhs[firstcol+2] - x0 * *Mki0++ - x1 * *Mki1++; + x3 = rhs[firstcol+3] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++; + x4 = rhs[firstcol+4] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++ + - x3 * *Mki3++; + x5 = rhs[firstcol+5] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++ + - x3 * *Mki3++ - x4 * *Mki4++; + x6 = rhs[firstcol+6] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++ + - x3 * *Mki3++ - x4 * *Mki4++ - x5 * *Mki5++; + x7 = rhs[firstcol+7] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++ + - x3 * *Mki3++ - x4 * *Mki4++ - x5 * *Mki5++ + - x6 * *Mki6++; + + rhs[++firstcol] = x1; + rhs[++firstcol] = x2; + rhs[++firstcol] = x3; + rhs[++firstcol] = x4; + rhs[++firstcol] = x5; + rhs[++firstcol] = x6; + rhs[++firstcol] = x7; + ++firstcol; + + for (k = firstcol; k < ncol; k++) + rhs[k] = rhs[k] - x0 * *Mki0++ - x1 * *Mki1++ + - x2 * *Mki2++ - x3 * *Mki3++ + - x4 * *Mki4++ - x5 * *Mki5++ + - x6 * *Mki6++ - x7 * *Mki7++; + + M0 += 8 * ldm + 8; + } + + while ( firstcol < ncol - 3 ) { /* Do 4 columns */ + Mki0 = M0 + 1; + Mki1 = Mki0 + ldm + 1; + Mki2 = Mki1 + ldm + 1; + Mki3 = Mki2 + ldm + 1; + + x0 = rhs[firstcol]; + x1 = rhs[firstcol+1] - x0 * *Mki0++; + x2 = rhs[firstcol+2] - x0 * *Mki0++ - x1 * *Mki1++; + x3 = rhs[firstcol+3] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++; + + rhs[++firstcol] = x1; + rhs[++firstcol] = x2; + rhs[++firstcol] = x3; + ++firstcol; + + for (k = firstcol; k < ncol; k++) + rhs[k] = rhs[k] - x0 * *Mki0++ - x1 * *Mki1++ + - x2 * *Mki2++ - x3 * *Mki3++; + + M0 += 4 * ldm + 4; + } + + if ( firstcol < ncol - 1 ) { /* Do 2 columns */ + Mki0 = M0 + 1; + Mki1 = Mki0 + ldm + 1; + + x0 = rhs[firstcol]; + x1 = rhs[firstcol+1] - x0 * *Mki0++; + + rhs[++firstcol] = x1; + ++firstcol; + + for (k = firstcol; k < ncol; k++) + rhs[k] = rhs[k] - x0 * *Mki0++ - x1 * *Mki1++; + + } + +} + +/* + * Solves a dense upper triangular system. The upper triangular matrix is + * stored in a 2-dim array M(1:ldm,1:ncol). The solution will be returned + * in the rhs vector. + */ +void +susolve ( ldm, ncol, M, rhs ) +int ldm; /* in */ +int ncol; /* in */ +float *M; /* in */ +float *rhs; /* modified */ +{ + float xj; + int jcol, j, irow; + + jcol = ncol - 1; + + for (j = 0; j < ncol; j++) { + + xj = rhs[jcol] / M[jcol + jcol*ldm]; /* M(jcol, jcol) */ + rhs[jcol] = xj; + + for (irow = 0; irow < jcol; irow++) + rhs[irow] -= xj * M[irow + jcol*ldm]; /* M(irow, jcol) */ + + jcol--; + + } +} + + +/* + * Performs a dense matrix-vector multiply: Mxvec = Mxvec + M * vec. + * The input matrix is M(1:nrow,1:ncol); The product is returned in Mxvec[]. + */ +void smatvec ( ldm, nrow, ncol, M, vec, Mxvec ) + +int ldm; /* in -- leading dimension of M */ +int nrow; /* in */ +int ncol; /* in */ +float *M; /* in */ +float *vec; /* in */ +float *Mxvec; /* in/out */ + +{ + float vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi7; + float *M0; + register float *Mki0, *Mki1, *Mki2, *Mki3, *Mki4, *Mki5, *Mki6, *Mki7; + register int firstcol = 0; + int k; + + M0 = &M[0]; + while ( firstcol < ncol - 7 ) { /* Do 8 columns */ + + Mki0 = M0; + Mki1 = Mki0 + ldm; + Mki2 = Mki1 + ldm; + Mki3 = Mki2 + ldm; + Mki4 = Mki3 + ldm; + Mki5 = Mki4 + ldm; + Mki6 = Mki5 + ldm; + Mki7 = Mki6 + ldm; + + vi0 = vec[firstcol++]; + vi1 = vec[firstcol++]; + vi2 = vec[firstcol++]; + vi3 = vec[firstcol++]; + vi4 = vec[firstcol++]; + vi5 = vec[firstcol++]; + vi6 = vec[firstcol++]; + vi7 = vec[firstcol++]; + + for (k = 0; k < nrow; k++) + Mxvec[k] += vi0 * *Mki0++ + vi1 * *Mki1++ + + vi2 * *Mki2++ + vi3 * *Mki3++ + + vi4 * *Mki4++ + vi5 * *Mki5++ + + vi6 * *Mki6++ + vi7 * *Mki7++; + + M0 += 8 * ldm; + } + + while ( firstcol < ncol - 3 ) { /* Do 4 columns */ + + Mki0 = M0; + Mki1 = Mki0 + ldm; + Mki2 = Mki1 + ldm; + Mki3 = Mki2 + ldm; + + vi0 = vec[firstcol++]; + vi1 = vec[firstcol++]; + vi2 = vec[firstcol++]; + vi3 = vec[firstcol++]; + for (k = 0; k < nrow; k++) + Mxvec[k] += vi0 * *Mki0++ + vi1 * *Mki1++ + + vi2 * *Mki2++ + vi3 * *Mki3++ ; + + M0 += 4 * ldm; + } + + while ( firstcol < ncol ) { /* Do 1 column */ + + Mki0 = M0; + vi0 = vec[firstcol++]; + for (k = 0; k < nrow; k++) + Mxvec[k] += vi0 * *Mki0++; + + M0 += ldm; + } + +} + diff --git a/intern/opennl/superlu/sp_coletree.c b/intern/opennl/superlu/sp_coletree.c new file mode 100644 index 00000000000..d49919167f5 --- /dev/null +++ b/intern/opennl/superlu/sp_coletree.c @@ -0,0 +1,332 @@ + +/* Elimination tree computation and layout routines */ + +#include +#include +#include "ssp_defs.h" + +/* + * Implementation of disjoint set union routines. + * Elements are integers in 0..n-1, and the + * names of the sets themselves are of type int. + * + * Calls are: + * initialize_disjoint_sets (n) initial call. + * s = make_set (i) returns a set containing only i. + * s = link (t, u) returns s = t union u, destroying t and u. + * s = find (i) return name of set containing i. + * finalize_disjoint_sets final call. + * + * This implementation uses path compression but not weighted union. + * See Tarjan's book for details. + * John Gilbert, CMI, 1987. + * + * Implemented path-halving by XSL 07/05/95. + */ + +static int *pp; /* parent array for sets */ + +static +int *mxCallocInt(int n) +{ + register int i; + int *buf; + + buf = (int *) SUPERLU_MALLOC( n * sizeof(int) ); + if ( !buf ) { + ABORT("SUPERLU_MALLOC fails for buf in mxCallocInt()"); + } + for (i = 0; i < n; i++) buf[i] = 0; + return (buf); +} + +static +void initialize_disjoint_sets ( + int n + ) +{ + pp = mxCallocInt(n); +} + + +static +int make_set ( + int i + ) +{ + pp[i] = i; + return i; +} + + +static +int link ( + int s, + int t + ) +{ + pp[s] = t; + return t; +} + + +/* PATH HALVING */ +static +int find (int i) +{ + register int p, gp; + + p = pp[i]; + gp = pp[p]; + while (gp != p) { + pp[i] = gp; + i = gp; + p = pp[i]; + gp = pp[p]; + } + return (p); +} + +#if 0 +/* PATH COMPRESSION */ +static +int find ( + int i + ) +{ + if (pp[i] != i) + pp[i] = find (pp[i]); + return pp[i]; +} +#endif + +static +void finalize_disjoint_sets ( + void + ) +{ + SUPERLU_FREE(pp); +} + + +/* + * Find the elimination tree for A'*A. + * This uses something similar to Liu's algorithm. + * It runs in time O(nz(A)*log n) and does not form A'*A. + * + * Input: + * Sparse matrix A. Numeric values are ignored, so any + * explicit zeros are treated as nonzero. + * Output: + * Integer array of parents representing the elimination + * tree of the symbolic product A'*A. Each vertex is a + * column of A, and nc means a root of the elimination forest. + * + * John R. Gilbert, Xerox, 10 Dec 1990 + * Based on code by JRG dated 1987, 1988, and 1990. + */ + +/* + * Nonsymmetric elimination tree + */ +int +sp_coletree( + int *acolst, int *acolend, /* column start and end past 1 */ + int *arow, /* row indices of A */ + int nr, int nc, /* dimension of A */ + int *parent /* parent in elim tree */ + ) +{ + int *root; /* root of subtee of etree */ + int *firstcol; /* first nonzero col in each row*/ + int rset, cset; + int row, col; + int rroot; + int p; + + root = mxCallocInt (nc); + initialize_disjoint_sets (nc); + + /* Compute firstcol[row] = first nonzero column in row */ + + firstcol = mxCallocInt (nr); + for (row = 0; row < nr; firstcol[row++] = nc); + for (col = 0; col < nc; col++) + for (p = acolst[col]; p < acolend[col]; p++) { + row = arow[p]; + firstcol[row] = SUPERLU_MIN(firstcol[row], col); + } + + /* Compute etree by Liu's algorithm for symmetric matrices, + except use (firstcol[r],c) in place of an edge (r,c) of A. + Thus each row clique in A'*A is replaced by a star + centered at its first vertex, which has the same fill. */ + + for (col = 0; col < nc; col++) { + cset = make_set (col); + root[cset] = col; + parent[col] = nc; /* Matlab */ + for (p = acolst[col]; p < acolend[col]; p++) { + row = firstcol[arow[p]]; + if (row >= col) continue; + rset = find (row); + rroot = root[rset]; + if (rroot != col) { + parent[rroot] = col; + cset = link (cset, rset); + root[cset] = col; + } + } + } + + SUPERLU_FREE (root); + SUPERLU_FREE (firstcol); + finalize_disjoint_sets (); + return 0; +} + +/* + * q = TreePostorder (n, p); + * + * Postorder a tree. + * Input: + * p is a vector of parent pointers for a forest whose + * vertices are the integers 0 to n-1; p[root]==n. + * Output: + * q is a vector indexed by 0..n-1 such that q[i] is the + * i-th vertex in a postorder numbering of the tree. + * + * ( 2/7/95 modified by X.Li: + * q is a vector indexed by 0:n-1 such that vertex i is the + * q[i]-th vertex in a postorder numbering of the tree. + * That is, this is the inverse of the previous q. ) + * + * In the child structure, lower-numbered children are represented + * first, so that a tree which is already numbered in postorder + * will not have its order changed. + * + * Written by John Gilbert, Xerox, 10 Dec 1990. + * Based on code written by John Gilbert at CMI in 1987. + */ + +static int *first_kid, *next_kid; /* Linked list of children. */ +static int *post, postnum; + +static +/* + * Depth-first search from vertex v. + */ +void etdfs ( + int v + ) +{ + int w; + + for (w = first_kid[v]; w != -1; w = next_kid[w]) { + etdfs (w); + } + /* post[postnum++] = v; in Matlab */ + post[v] = postnum++; /* Modified by X.Li on 2/14/95 */ +} + + +/* + * Post order a tree + */ +int *TreePostorder( + int n, + int *parent +) +{ + int v, dad; + + /* Allocate storage for working arrays and results */ + first_kid = mxCallocInt (n+1); + next_kid = mxCallocInt (n+1); + post = mxCallocInt (n+1); + + /* Set up structure describing children */ + for (v = 0; v <= n; first_kid[v++] = -1); + for (v = n-1; v >= 0; v--) { + dad = parent[v]; + next_kid[v] = first_kid[dad]; + first_kid[dad] = v; + } + + /* Depth-first search from dummy root vertex #n */ + postnum = 0; + etdfs (n); + + SUPERLU_FREE (first_kid); + SUPERLU_FREE (next_kid); + return post; +} + + +/* + * p = spsymetree (A); + * + * Find the elimination tree for symmetric matrix A. + * This uses Liu's algorithm, and runs in time O(nz*log n). + * + * Input: + * Square sparse matrix A. No check is made for symmetry; + * elements below and on the diagonal are ignored. + * Numeric values are ignored, so any explicit zeros are + * treated as nonzero. + * Output: + * Integer array of parents representing the etree, with n + * meaning a root of the elimination forest. + * Note: + * This routine uses only the upper triangle, while sparse + * Cholesky (as in spchol.c) uses only the lower. Matlab's + * dense Cholesky uses only the upper. This routine could + * be modified to use the lower triangle either by transposing + * the matrix or by traversing it by rows with auxiliary + * pointer and link arrays. + * + * John R. Gilbert, Xerox, 10 Dec 1990 + * Based on code by JRG dated 1987, 1988, and 1990. + * Modified by X.S. Li, November 1999. + */ + +/* + * Symmetric elimination tree + */ +int +sp_symetree( + int *acolst, int *acolend, /* column starts and ends past 1 */ + int *arow, /* row indices of A */ + int n, /* dimension of A */ + int *parent /* parent in elim tree */ + ) +{ + int *root; /* root of subtree of etree */ + int rset, cset; + int row, col; + int rroot; + int p; + + root = mxCallocInt (n); + initialize_disjoint_sets (n); + + for (col = 0; col < n; col++) { + cset = make_set (col); + root[cset] = col; + parent[col] = n; /* Matlab */ + for (p = acolst[col]; p < acolend[col]; p++) { + row = arow[p]; + if (row >= col) continue; + rset = find (row); + rroot = root[rset]; + if (rroot != col) { + parent[rroot] = col; + cset = link (cset, rset); + root[cset] = col; + } + } + } + SUPERLU_FREE (root); + finalize_disjoint_sets (); + return 0; +} /* SP_SYMETREE */ diff --git a/intern/opennl/superlu/sp_ienv.c b/intern/opennl/superlu/sp_ienv.c new file mode 100644 index 00000000000..5b0ba7b2151 --- /dev/null +++ b/intern/opennl/superlu/sp_ienv.c @@ -0,0 +1,65 @@ +/* + * File name: sp_ienv.c + * History: Modified from lapack routine ILAENV + */ + +#include "ssp_defs.h" +#include "util.h" + +int +sp_ienv(int ispec) +{ +/* + Purpose + ======= + + sp_ienv() is inquired to choose machine-dependent parameters for the + local environment. See ISPEC for a description of the parameters. + + This version provides a set of parameters which should give good, + but not optimal, performance on many of the currently available + computers. Users are encouraged to modify this subroutine to set + the tuning parameters for their particular machine using the option + and problem size information in the arguments. + + Arguments + ========= + + ISPEC (input) int + Specifies the parameter to be returned as the value of SP_IENV. + = 1: the panel size w; a panel consists of w consecutive + columns of matrix A in the process of Gaussian elimination. + The best value depends on machine's cache characters. + = 2: the relaxation parameter relax; if the number of + nodes (columns) in a subtree of the elimination tree is less + than relax, this subtree is considered as one supernode, + regardless of their row structures. + = 3: the maximum size for a supernode; + = 4: the minimum row dimension for 2-D blocking to be used; + = 5: the minimum column dimension for 2-D blocking to be used; + = 6: the estimated fills factor for L and U, compared with A; + + (SP_IENV) (output) int + >= 0: the value of the parameter specified by ISPEC + < 0: if SP_IENV = -k, the k-th argument had an illegal value. + + ===================================================================== +*/ + int i; + + switch (ispec) { + case 1: return (10); + case 2: return (5); + case 3: return (100); + case 4: return (200); + case 5: return (40); + case 6: return (20); + } + + /* Invalid value for ISPEC */ + i = 1; + xerbla_("sp_ienv", &i); + return 0; + +} /* sp_ienv_ */ + diff --git a/intern/opennl/superlu/sp_preorder.c b/intern/opennl/superlu/sp_preorder.c new file mode 100644 index 00000000000..f82da2de1aa --- /dev/null +++ b/intern/opennl/superlu/sp_preorder.c @@ -0,0 +1,203 @@ +#include "ssp_defs.h" + +void +sp_preorder(superlu_options_t *options, SuperMatrix *A, int *perm_c, + int *etree, SuperMatrix *AC) +{ +/* + * Purpose + * ======= + * + * sp_preorder() permutes the columns of the original matrix. It performs + * the following steps: + * + * 1. Apply column permutation perm_c[] to A's column pointers to form AC; + * + * 2. If options->Fact = DOFACT, then + * (1) Compute column elimination tree etree[] of AC'AC; + * (2) Post order etree[] to get a postordered elimination tree etree[], + * and a postorder permutation post[]; + * (3) Apply post[] permutation to columns of AC; + * (4) Overwrite perm_c[] with the product perm_c * post. + * + * Arguments + * ========= + * + * options (input) superlu_options_t* + * Specifies whether or not the elimination tree will be re-used. + * If options->Fact == DOFACT, this means first time factor A, + * etree is computed, postered, and output. + * Otherwise, re-factor A, etree is input, unchanged on exit. + * + * A (input) SuperMatrix* + * Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number + * of the linear equations is A->nrow. Currently, the type of A can be: + * Stype = NC or SLU_NCP; Mtype = SLU_GE. + * In the future, more general A may be handled. + * + * perm_c (input/output) int* + * Column permutation vector of size A->ncol, which defines the + * permutation matrix Pc; perm_c[i] = j means column i of A is + * in position j in A*Pc. + * If options->Fact == DOFACT, perm_c is both input and output. + * On output, it is changed according to a postorder of etree. + * Otherwise, perm_c is input. + * + * etree (input/output) int* + * Elimination tree of Pc'*A'*A*Pc, dimension A->ncol. + * If options->Fact == DOFACT, etree is an output argument, + * otherwise it is an input argument. + * Note: etree is a vector of parent pointers for a forest whose + * vertices are the integers 0 to A->ncol-1; etree[root]==A->ncol. + * + * AC (output) SuperMatrix* + * The resulting matrix after applied the column permutation + * perm_c[] to matrix A. The type of AC can be: + * Stype = SLU_NCP; Dtype = A->Dtype; Mtype = SLU_GE. + * + */ + + NCformat *Astore; + NCPformat *ACstore; + int *iwork, *post; + register int n, i; + + n = A->ncol; + + /* Apply column permutation perm_c to A's column pointers so to + obtain NCP format in AC = A*Pc. */ + AC->Stype = SLU_NCP; + AC->Dtype = A->Dtype; + AC->Mtype = A->Mtype; + AC->nrow = A->nrow; + AC->ncol = A->ncol; + Astore = A->Store; + ACstore = AC->Store = (void *) SUPERLU_MALLOC( sizeof(NCPformat) ); + if ( !ACstore ) ABORT("SUPERLU_MALLOC fails for ACstore"); + ACstore->nnz = Astore->nnz; + ACstore->nzval = Astore->nzval; + ACstore->rowind = Astore->rowind; + ACstore->colbeg = (int*) SUPERLU_MALLOC(n*sizeof(int)); + if ( !(ACstore->colbeg) ) ABORT("SUPERLU_MALLOC fails for ACstore->colbeg"); + ACstore->colend = (int*) SUPERLU_MALLOC(n*sizeof(int)); + if ( !(ACstore->colend) ) ABORT("SUPERLU_MALLOC fails for ACstore->colend"); + +#ifdef DEBUG + print_int_vec("pre_order:", n, perm_c); + check_perm("Initial perm_c", n, perm_c); +#endif + + for (i = 0; i < n; i++) { + ACstore->colbeg[perm_c[i]] = Astore->colptr[i]; + ACstore->colend[perm_c[i]] = Astore->colptr[i+1]; + } + + if ( options->Fact == DOFACT ) { +#undef ETREE_ATplusA +#ifdef ETREE_ATplusA + /*-------------------------------------------- + COMPUTE THE ETREE OF Pc*(A'+A)*Pc'. + --------------------------------------------*/ + int *b_colptr, *b_rowind, bnz, j; + int *c_colbeg, *c_colend; + + /*printf("Use etree(A'+A)\n");*/ + + /* Form B = A + A'. */ + at_plus_a(n, Astore->nnz, Astore->colptr, Astore->rowind, + &bnz, &b_colptr, &b_rowind); + + /* Form C = Pc*B*Pc'. */ + c_colbeg = (int*) SUPERLU_MALLOC(2*n*sizeof(int)); + c_colend = c_colbeg + n; + if (!c_colbeg ) ABORT("SUPERLU_MALLOC fails for c_colbeg/c_colend"); + for (i = 0; i < n; i++) { + c_colbeg[perm_c[i]] = b_colptr[i]; + c_colend[perm_c[i]] = b_colptr[i+1]; + } + for (j = 0; j < n; ++j) { + for (i = c_colbeg[j]; i < c_colend[j]; ++i) { + b_rowind[i] = perm_c[b_rowind[i]]; + } + } + + /* Compute etree of C. */ + sp_symetree(c_colbeg, c_colend, b_rowind, n, etree); + + SUPERLU_FREE(b_colptr); + if ( bnz ) SUPERLU_FREE(b_rowind); + SUPERLU_FREE(c_colbeg); + +#else + /*-------------------------------------------- + COMPUTE THE COLUMN ELIMINATION TREE. + --------------------------------------------*/ + sp_coletree(ACstore->colbeg, ACstore->colend, ACstore->rowind, + A->nrow, A->ncol, etree); +#endif +#ifdef DEBUG + print_int_vec("etree:", n, etree); +#endif + + /* In symmetric mode, do not do postorder here. */ + if ( options->SymmetricMode == NO ) { + /* Post order etree */ + post = (int *) TreePostorder(n, etree); + /* for (i = 0; i < n+1; ++i) inv_post[post[i]] = i; + iwork = post; */ + +#ifdef DEBUG + print_int_vec("post:", n+1, post); + check_perm("post", n, post); +#endif + iwork = (int*) SUPERLU_MALLOC((n+1)*sizeof(int)); + if ( !iwork ) ABORT("SUPERLU_MALLOC fails for iwork[]"); + + /* Renumber etree in postorder */ + for (i = 0; i < n; ++i) iwork[post[i]] = post[etree[i]]; + for (i = 0; i < n; ++i) etree[i] = iwork[i]; + +#ifdef DEBUG + print_int_vec("postorder etree:", n, etree); +#endif + + /* Postmultiply A*Pc by post[] */ + for (i = 0; i < n; ++i) iwork[post[i]] = ACstore->colbeg[i]; + for (i = 0; i < n; ++i) ACstore->colbeg[i] = iwork[i]; + for (i = 0; i < n; ++i) iwork[post[i]] = ACstore->colend[i]; + for (i = 0; i < n; ++i) ACstore->colend[i] = iwork[i]; + + for (i = 0; i < n; ++i) + iwork[i] = post[perm_c[i]]; /* product of perm_c and post */ + for (i = 0; i < n; ++i) perm_c[i] = iwork[i]; + +#ifdef DEBUG + print_int_vec("Pc*post:", n, perm_c); + check_perm("final perm_c", n, perm_c); +#endif + SUPERLU_FREE (post); + SUPERLU_FREE (iwork); + } /* end postordering */ + + } /* if options->Fact == DOFACT ... */ + +} + +int check_perm(char *what, int n, int *perm) +{ + register int i; + int *marker; + marker = (int *) calloc(n, sizeof(int)); + + for (i = 0; i < n; ++i) { + if ( marker[perm[i]] == 1 || perm[i] >= n ) { + printf("%s: Not a valid PERM[%d] = %d\n", what, i, perm[i]); + ABORT("check_perm"); + } else { + marker[perm[i]] = 1; + } + } + + SUPERLU_FREE(marker); + return 0; +} diff --git a/intern/opennl/superlu/spanel_bmod.c b/intern/opennl/superlu/spanel_bmod.c new file mode 100644 index 00000000000..a59a9086df1 --- /dev/null +++ b/intern/opennl/superlu/spanel_bmod.c @@ -0,0 +1,449 @@ + +/* + * -- SuperLU routine (version 3.0) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * October 15, 2003 + * + */ +/* + Copyright (c) 1994 by Xerox Corporation. All rights reserved. + + THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY + EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. + + Permission is hereby granted to use or copy this program for any + purpose, provided the above notices are retained on all copies. + Permission to modify the code and to distribute modified code is + granted, provided the above notices are retained, and a notice that + the code was modified is included with the above copyright notice. +*/ + +#include +#include +#include "ssp_defs.h" + +/* + * Function prototypes + */ +void slsolve(int, int, float *, float *); +void smatvec(int, int, int, float *, float *, float *); +extern void scheck_tempv(); + +void +spanel_bmod ( + const int m, /* in - number of rows in the matrix */ + const int w, /* in */ + const int jcol, /* in */ + const int nseg, /* in */ + float *dense, /* out, of size n by w */ + float *tempv, /* working array */ + int *segrep, /* in */ + int *repfnz, /* in, of size n by w */ + GlobalLU_t *Glu, /* modified */ + SuperLUStat_t *stat /* output */ + ) +{ +/* + * Purpose + * ======= + * + * Performs numeric block updates (sup-panel) in topological order. + * It features: col-col, 2cols-col, 3cols-col, and sup-col updates. + * Special processing on the supernodal portion of L\U[*,j] + * + * Before entering this routine, the original nonzeros in the panel + * were already copied into the spa[m,w]. + * + * Updated/Output parameters- + * dense[0:m-1,w]: L[*,j:j+w-1] and U[*,j:j+w-1] are returned + * collectively in the m-by-w vector dense[*]. + * + */ + +#ifdef USE_VENDOR_BLAS +#ifdef _CRAY + _fcd ftcs1 = _cptofcd("L", strlen("L")), + ftcs2 = _cptofcd("N", strlen("N")), + ftcs3 = _cptofcd("U", strlen("U")); +#endif + int incx = 1, incy = 1; + float alpha, beta; +#endif + + register int k, ksub; + int fsupc, nsupc, nsupr, nrow; + int krep, krep_ind; + float ukj, ukj1, ukj2; + int luptr, luptr1, luptr2; + int segsze; + int block_nrow; /* no of rows in a block row */ + register int lptr; /* Points to the row subscripts of a supernode */ + int kfnz, irow, no_zeros; + register int isub, isub1, i; + register int jj; /* Index through each column in the panel */ + int *xsup, *supno; + int *lsub, *xlsub; + float *lusup; + int *xlusup; + int *repfnz_col; /* repfnz[] for a column in the panel */ + float *dense_col; /* dense[] for a column in the panel */ + float *tempv1; /* Used in 1-D update */ + float *TriTmp, *MatvecTmp; /* used in 2-D update */ + float zero = 0.0; + register int ldaTmp; + register int r_ind, r_hi; + static int first = 1, maxsuper, rowblk, colblk; + flops_t *ops = stat->ops; + + xsup = Glu->xsup; + supno = Glu->supno; + lsub = Glu->lsub; + xlsub = Glu->xlsub; + lusup = Glu->lusup; + xlusup = Glu->xlusup; + + if ( first ) { + maxsuper = sp_ienv(3); + rowblk = sp_ienv(4); + colblk = sp_ienv(5); + first = 0; + } + ldaTmp = maxsuper + rowblk; + + /* + * For each nonz supernode segment of U[*,j] in topological order + */ + k = nseg - 1; + for (ksub = 0; ksub < nseg; ksub++) { /* for each updating supernode */ + + /* krep = representative of current k-th supernode + * fsupc = first supernodal column + * nsupc = no of columns in a supernode + * nsupr = no of rows in a supernode + */ + krep = segrep[k--]; + fsupc = xsup[supno[krep]]; + nsupc = krep - fsupc + 1; + nsupr = xlsub[fsupc+1] - xlsub[fsupc]; + nrow = nsupr - nsupc; + lptr = xlsub[fsupc]; + krep_ind = lptr + nsupc - 1; + + repfnz_col = repfnz; + dense_col = dense; + + if ( nsupc >= colblk && nrow > rowblk ) { /* 2-D block update */ + + TriTmp = tempv; + + /* Sequence through each column in panel -- triangular solves */ + for (jj = jcol; jj < jcol + w; jj++, + repfnz_col += m, dense_col += m, TriTmp += ldaTmp ) { + + kfnz = repfnz_col[krep]; + if ( kfnz == EMPTY ) continue; /* Skip any zero segment */ + + segsze = krep - kfnz + 1; + luptr = xlusup[fsupc]; + + ops[TRSV] += segsze * (segsze - 1); + ops[GEMV] += 2 * nrow * segsze; + + /* Case 1: Update U-segment of size 1 -- col-col update */ + if ( segsze == 1 ) { + ukj = dense_col[lsub[krep_ind]]; + luptr += nsupr*(nsupc-1) + nsupc; + + for (i = lptr + nsupc; i < xlsub[fsupc+1]; i++) { + irow = lsub[i]; + dense_col[irow] -= ukj * lusup[luptr]; + ++luptr; + } + + } else if ( segsze <= 3 ) { + ukj = dense_col[lsub[krep_ind]]; + ukj1 = dense_col[lsub[krep_ind - 1]]; + luptr += nsupr*(nsupc-1) + nsupc-1; + luptr1 = luptr - nsupr; + + if ( segsze == 2 ) { + ukj -= ukj1 * lusup[luptr1]; + dense_col[lsub[krep_ind]] = ukj; + for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { + irow = lsub[i]; + luptr++; luptr1++; + dense_col[irow] -= (ukj*lusup[luptr] + + ukj1*lusup[luptr1]); + } + } else { + ukj2 = dense_col[lsub[krep_ind - 2]]; + luptr2 = luptr1 - nsupr; + ukj1 -= ukj2 * lusup[luptr2-1]; + ukj = ukj - ukj1*lusup[luptr1] - ukj2*lusup[luptr2]; + dense_col[lsub[krep_ind]] = ukj; + dense_col[lsub[krep_ind-1]] = ukj1; + for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { + irow = lsub[i]; + luptr++; luptr1++; luptr2++; + dense_col[irow] -= ( ukj*lusup[luptr] + + ukj1*lusup[luptr1] + ukj2*lusup[luptr2] ); + } + } + + } else { /* segsze >= 4 */ + + /* Copy U[*,j] segment from dense[*] to TriTmp[*], which + holds the result of triangular solves. */ + no_zeros = kfnz - fsupc; + isub = lptr + no_zeros; + for (i = 0; i < segsze; ++i) { + irow = lsub[isub]; + TriTmp[i] = dense_col[irow]; /* Gather */ + ++isub; + } + + /* start effective triangle */ + luptr += nsupr * no_zeros + no_zeros; + +#ifdef USE_VENDOR_BLAS +#ifdef _CRAY + STRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], + &nsupr, TriTmp, &incx ); +#else + strsv_( "L", "N", "U", &segsze, &lusup[luptr], + &nsupr, TriTmp, &incx ); +#endif +#else + slsolve ( nsupr, segsze, &lusup[luptr], TriTmp ); +#endif + + + } /* else ... */ + + } /* for jj ... end tri-solves */ + + /* Block row updates; push all the way into dense[*] block */ + for ( r_ind = 0; r_ind < nrow; r_ind += rowblk ) { + + r_hi = SUPERLU_MIN(nrow, r_ind + rowblk); + block_nrow = SUPERLU_MIN(rowblk, r_hi - r_ind); + luptr = xlusup[fsupc] + nsupc + r_ind; + isub1 = lptr + nsupc + r_ind; + + repfnz_col = repfnz; + TriTmp = tempv; + dense_col = dense; + + /* Sequence through each column in panel -- matrix-vector */ + for (jj = jcol; jj < jcol + w; jj++, + repfnz_col += m, dense_col += m, TriTmp += ldaTmp) { + + kfnz = repfnz_col[krep]; + if ( kfnz == EMPTY ) continue; /* Skip any zero segment */ + + segsze = krep - kfnz + 1; + if ( segsze <= 3 ) continue; /* skip unrolled cases */ + + /* Perform a block update, and scatter the result of + matrix-vector to dense[]. */ + no_zeros = kfnz - fsupc; + luptr1 = luptr + nsupr * no_zeros; + MatvecTmp = &TriTmp[maxsuper]; + +#ifdef USE_VENDOR_BLAS + alpha = one; + beta = zero; +#ifdef _CRAY + SGEMV(ftcs2, &block_nrow, &segsze, &alpha, &lusup[luptr1], + &nsupr, TriTmp, &incx, &beta, MatvecTmp, &incy); +#else + sgemv_("N", &block_nrow, &segsze, &alpha, &lusup[luptr1], + &nsupr, TriTmp, &incx, &beta, MatvecTmp, &incy); +#endif +#else + smatvec(nsupr, block_nrow, segsze, &lusup[luptr1], + TriTmp, MatvecTmp); +#endif + + /* Scatter MatvecTmp[*] into SPA dense[*] temporarily + * such that MatvecTmp[*] can be re-used for the + * the next blok row update. dense[] will be copied into + * global store after the whole panel has been finished. + */ + isub = isub1; + for (i = 0; i < block_nrow; i++) { + irow = lsub[isub]; + dense_col[irow] -= MatvecTmp[i]; + MatvecTmp[i] = zero; + ++isub; + } + + } /* for jj ... */ + + } /* for each block row ... */ + + /* Scatter the triangular solves into SPA dense[*] */ + repfnz_col = repfnz; + TriTmp = tempv; + dense_col = dense; + + for (jj = jcol; jj < jcol + w; jj++, + repfnz_col += m, dense_col += m, TriTmp += ldaTmp) { + kfnz = repfnz_col[krep]; + if ( kfnz == EMPTY ) continue; /* Skip any zero segment */ + + segsze = krep - kfnz + 1; + if ( segsze <= 3 ) continue; /* skip unrolled cases */ + + no_zeros = kfnz - fsupc; + isub = lptr + no_zeros; + for (i = 0; i < segsze; i++) { + irow = lsub[isub]; + dense_col[irow] = TriTmp[i]; + TriTmp[i] = zero; + ++isub; + } + + } /* for jj ... */ + + } else { /* 1-D block modification */ + + + /* Sequence through each column in the panel */ + for (jj = jcol; jj < jcol + w; jj++, + repfnz_col += m, dense_col += m) { + + kfnz = repfnz_col[krep]; + if ( kfnz == EMPTY ) continue; /* Skip any zero segment */ + + segsze = krep - kfnz + 1; + luptr = xlusup[fsupc]; + + ops[TRSV] += segsze * (segsze - 1); + ops[GEMV] += 2 * nrow * segsze; + + /* Case 1: Update U-segment of size 1 -- col-col update */ + if ( segsze == 1 ) { + ukj = dense_col[lsub[krep_ind]]; + luptr += nsupr*(nsupc-1) + nsupc; + + for (i = lptr + nsupc; i < xlsub[fsupc+1]; i++) { + irow = lsub[i]; + dense_col[irow] -= ukj * lusup[luptr]; + ++luptr; + } + + } else if ( segsze <= 3 ) { + ukj = dense_col[lsub[krep_ind]]; + luptr += nsupr*(nsupc-1) + nsupc-1; + ukj1 = dense_col[lsub[krep_ind - 1]]; + luptr1 = luptr - nsupr; + + if ( segsze == 2 ) { + ukj -= ukj1 * lusup[luptr1]; + dense_col[lsub[krep_ind]] = ukj; + for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { + irow = lsub[i]; + ++luptr; ++luptr1; + dense_col[irow] -= (ukj*lusup[luptr] + + ukj1*lusup[luptr1]); + } + } else { + ukj2 = dense_col[lsub[krep_ind - 2]]; + luptr2 = luptr1 - nsupr; + ukj1 -= ukj2 * lusup[luptr2-1]; + ukj = ukj - ukj1*lusup[luptr1] - ukj2*lusup[luptr2]; + dense_col[lsub[krep_ind]] = ukj; + dense_col[lsub[krep_ind-1]] = ukj1; + for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { + irow = lsub[i]; + ++luptr; ++luptr1; ++luptr2; + dense_col[irow] -= ( ukj*lusup[luptr] + + ukj1*lusup[luptr1] + ukj2*lusup[luptr2] ); + } + } + + } else { /* segsze >= 4 */ + /* + * Perform a triangular solve and block update, + * then scatter the result of sup-col update to dense[]. + */ + no_zeros = kfnz - fsupc; + + /* Copy U[*,j] segment from dense[*] to tempv[*]: + * The result of triangular solve is in tempv[*]; + * The result of matrix vector update is in dense_col[*] + */ + isub = lptr + no_zeros; + for (i = 0; i < segsze; ++i) { + irow = lsub[isub]; + tempv[i] = dense_col[irow]; /* Gather */ + ++isub; + } + + /* start effective triangle */ + luptr += nsupr * no_zeros + no_zeros; + +#ifdef USE_VENDOR_BLAS +#ifdef _CRAY + STRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], + &nsupr, tempv, &incx ); +#else + strsv_( "L", "N", "U", &segsze, &lusup[luptr], + &nsupr, tempv, &incx ); +#endif + + luptr += segsze; /* Dense matrix-vector */ + tempv1 = &tempv[segsze]; + alpha = one; + beta = zero; +#ifdef _CRAY + SGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr], + &nsupr, tempv, &incx, &beta, tempv1, &incy ); +#else + sgemv_( "N", &nrow, &segsze, &alpha, &lusup[luptr], + &nsupr, tempv, &incx, &beta, tempv1, &incy ); +#endif +#else + slsolve ( nsupr, segsze, &lusup[luptr], tempv ); + + luptr += segsze; /* Dense matrix-vector */ + tempv1 = &tempv[segsze]; + smatvec (nsupr, nrow, segsze, &lusup[luptr], tempv, tempv1); +#endif + + /* Scatter tempv[*] into SPA dense[*] temporarily, such + * that tempv[*] can be used for the triangular solve of + * the next column of the panel. They will be copied into + * ucol[*] after the whole panel has been finished. + */ + isub = lptr + no_zeros; + for (i = 0; i < segsze; i++) { + irow = lsub[isub]; + dense_col[irow] = tempv[i]; + tempv[i] = zero; + isub++; + } + + /* Scatter the update from tempv1[*] into SPA dense[*] */ + /* Start dense rectangular L */ + for (i = 0; i < nrow; i++) { + irow = lsub[isub]; + dense_col[irow] -= tempv1[i]; + tempv1[i] = zero; + ++isub; + } + + } /* else segsze>=4 ... */ + + } /* for each column in the panel... */ + + } /* else 1-D update ... */ + + } /* for each updating supernode ... */ + +} + + + diff --git a/intern/opennl/superlu/spanel_dfs.c b/intern/opennl/superlu/spanel_dfs.c new file mode 100644 index 00000000000..7f5f3c7532a --- /dev/null +++ b/intern/opennl/superlu/spanel_dfs.c @@ -0,0 +1,249 @@ + + +/* + * -- SuperLU routine (version 2.0) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * November 15, 1997 + * + */ +/* + Copyright (c) 1994 by Xerox Corporation. All rights reserved. + + THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY + EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. + + Permission is hereby granted to use or copy this program for any + purpose, provided the above notices are retained on all copies. + Permission to modify the code and to distribute modified code is + granted, provided the above notices are retained, and a notice that + the code was modified is included with the above copyright notice. +*/ + +#include "ssp_defs.h" +#include "util.h" + +void +spanel_dfs ( + const int m, /* in - number of rows in the matrix */ + const int w, /* in */ + const int jcol, /* in */ + SuperMatrix *A, /* in - original matrix */ + int *perm_r, /* in */ + int *nseg, /* out */ + float *dense, /* out */ + int *panel_lsub, /* out */ + int *segrep, /* out */ + int *repfnz, /* out */ + int *xprune, /* out */ + int *marker, /* out */ + int *parent, /* working array */ + int *xplore, /* working array */ + GlobalLU_t *Glu /* modified */ + ) +{ +/* + * Purpose + * ======= + * + * Performs a symbolic factorization on a panel of columns [jcol, jcol+w). + * + * A supernode representative is the last column of a supernode. + * The nonzeros in U[*,j] are segments that end at supernodal + * representatives. + * + * The routine returns one list of the supernodal representatives + * in topological order of the dfs that generates them. This list is + * a superset of the topological order of each individual column within + * the panel. + * The location of the first nonzero in each supernodal segment + * (supernodal entry location) is also returned. Each column has a + * separate list for this purpose. + * + * Two marker arrays are used for dfs: + * marker[i] == jj, if i was visited during dfs of current column jj; + * marker1[i] >= jcol, if i was visited by earlier columns in this panel; + * + * marker: A-row --> A-row/col (0/1) + * repfnz: SuperA-col --> PA-row + * parent: SuperA-col --> SuperA-col + * xplore: SuperA-col --> index to L-structure + * + */ + NCPformat *Astore; + float *a; + int *asub; + int *xa_begin, *xa_end; + int krep, chperm, chmark, chrep, oldrep, kchild, myfnz; + int k, krow, kmark, kperm; + int xdfs, maxdfs, kpar; + int jj; /* index through each column in the panel */ + int *marker1; /* marker1[jj] >= jcol if vertex jj was visited + by a previous column within this panel. */ + int *repfnz_col; /* start of each column in the panel */ + float *dense_col; /* start of each column in the panel */ + int nextl_col; /* next available position in panel_lsub[*,jj] */ + int *xsup, *supno; + int *lsub, *xlsub; + + /* Initialize pointers */ + Astore = A->Store; + a = Astore->nzval; + asub = Astore->rowind; + xa_begin = Astore->colbeg; + xa_end = Astore->colend; + marker1 = marker + m; + repfnz_col = repfnz; + dense_col = dense; + *nseg = 0; + xsup = Glu->xsup; + supno = Glu->supno; + lsub = Glu->lsub; + xlsub = Glu->xlsub; + + /* For each column in the panel */ + for (jj = jcol; jj < jcol + w; jj++) { + nextl_col = (jj - jcol) * m; + +#ifdef CHK_DFS + printf("\npanel col %d: ", jj); +#endif + + /* For each nonz in A[*,jj] do dfs */ + for (k = xa_begin[jj]; k < xa_end[jj]; k++) { + krow = asub[k]; + dense_col[krow] = a[k]; + kmark = marker[krow]; + if ( kmark == jj ) + continue; /* krow visited before, go to the next nonzero */ + + /* For each unmarked nbr krow of jj + * krow is in L: place it in structure of L[*,jj] + */ + marker[krow] = jj; + kperm = perm_r[krow]; + + if ( kperm == EMPTY ) { + panel_lsub[nextl_col++] = krow; /* krow is indexed into A */ + } + /* + * krow is in U: if its supernode-rep krep + * has been explored, update repfnz[*] + */ + else { + + krep = xsup[supno[kperm]+1] - 1; + myfnz = repfnz_col[krep]; + +#ifdef CHK_DFS + printf("krep %d, myfnz %d, perm_r[%d] %d\n", krep, myfnz, krow, kperm); +#endif + if ( myfnz != EMPTY ) { /* Representative visited before */ + if ( myfnz > kperm ) repfnz_col[krep] = kperm; + /* continue; */ + } + else { + /* Otherwise, perform dfs starting at krep */ + oldrep = EMPTY; + parent[krep] = oldrep; + repfnz_col[krep] = kperm; + xdfs = xlsub[krep]; + maxdfs = xprune[krep]; + +#ifdef CHK_DFS + printf(" xdfs %d, maxdfs %d: ", xdfs, maxdfs); + for (i = xdfs; i < maxdfs; i++) printf(" %d", lsub[i]); + printf("\n"); +#endif + do { + /* + * For each unmarked kchild of krep + */ + while ( xdfs < maxdfs ) { + + kchild = lsub[xdfs]; + xdfs++; + chmark = marker[kchild]; + + if ( chmark != jj ) { /* Not reached yet */ + marker[kchild] = jj; + chperm = perm_r[kchild]; + + /* Case kchild is in L: place it in L[*,j] */ + if ( chperm == EMPTY ) { + panel_lsub[nextl_col++] = kchild; + } + /* Case kchild is in U: + * chrep = its supernode-rep. If its rep has + * been explored, update its repfnz[*] + */ + else { + + chrep = xsup[supno[chperm]+1] - 1; + myfnz = repfnz_col[chrep]; +#ifdef CHK_DFS + printf("chrep %d,myfnz %d,perm_r[%d] %d\n",chrep,myfnz,kchild,chperm); +#endif + if ( myfnz != EMPTY ) { /* Visited before */ + if ( myfnz > chperm ) + repfnz_col[chrep] = chperm; + } + else { + /* Cont. dfs at snode-rep of kchild */ + xplore[krep] = xdfs; + oldrep = krep; + krep = chrep; /* Go deeper down G(L) */ + parent[krep] = oldrep; + repfnz_col[krep] = chperm; + xdfs = xlsub[krep]; + maxdfs = xprune[krep]; +#ifdef CHK_DFS + printf(" xdfs %d, maxdfs %d: ", xdfs, maxdfs); + for (i = xdfs; i < maxdfs; i++) printf(" %d", lsub[i]); + printf("\n"); +#endif + } /* else */ + + } /* else */ + + } /* if... */ + + } /* while xdfs < maxdfs */ + + /* krow has no more unexplored nbrs: + * Place snode-rep krep in postorder DFS, if this + * segment is seen for the first time. (Note that + * "repfnz[krep]" may change later.) + * Backtrack dfs to its parent. + */ + if ( marker1[krep] < jcol ) { + segrep[*nseg] = krep; + ++(*nseg); + marker1[krep] = jj; + } + + kpar = parent[krep]; /* Pop stack, mimic recursion */ + if ( kpar == EMPTY ) break; /* dfs done */ + krep = kpar; + xdfs = xplore[krep]; + maxdfs = xprune[krep]; + +#ifdef CHK_DFS + printf(" pop stack: krep %d,xdfs %d,maxdfs %d: ", krep,xdfs,maxdfs); + for (i = xdfs; i < maxdfs; i++) printf(" %d", lsub[i]); + printf("\n"); +#endif + } while ( kpar != EMPTY ); /* do-while - until empty stack */ + + } /* else */ + + } /* else */ + + } /* for each nonz in A[*,jj] */ + + repfnz_col += m; /* Move to next column */ + dense_col += m; + + } /* for jj ... */ + +} diff --git a/intern/opennl/superlu/spivotL.c b/intern/opennl/superlu/spivotL.c new file mode 100644 index 00000000000..6243065bb5b --- /dev/null +++ b/intern/opennl/superlu/spivotL.c @@ -0,0 +1,173 @@ + +/* + * -- SuperLU routine (version 3.0) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * October 15, 2003 + * + */ +/* + Copyright (c) 1994 by Xerox Corporation. All rights reserved. + + THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY + EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. + + Permission is hereby granted to use or copy this program for any + purpose, provided the above notices are retained on all copies. + Permission to modify the code and to distribute modified code is + granted, provided the above notices are retained, and a notice that + the code was modified is included with the above copyright notice. +*/ + +#include +#include +#include "ssp_defs.h" + +#undef DEBUG + +int +spivotL( + const int jcol, /* in */ + const float u, /* in - diagonal pivoting threshold */ + int *usepr, /* re-use the pivot sequence given by perm_r/iperm_r */ + int *perm_r, /* may be modified */ + int *iperm_r, /* in - inverse of perm_r */ + int *iperm_c, /* in - used to find diagonal of Pc*A*Pc' */ + int *pivrow, /* out */ + GlobalLU_t *Glu, /* modified - global LU data structures */ + SuperLUStat_t *stat /* output */ + ) +{ +/* + * Purpose + * ======= + * Performs the numerical pivoting on the current column of L, + * and the CDIV operation. + * + * Pivot policy: + * (1) Compute thresh = u * max_(i>=j) abs(A_ij); + * (2) IF user specifies pivot row k and abs(A_kj) >= thresh THEN + * pivot row = k; + * ELSE IF abs(A_jj) >= thresh THEN + * pivot row = j; + * ELSE + * pivot row = m; + * + * Note: If you absolutely want to use a given pivot order, then set u=0.0. + * + * Return value: 0 success; + * i > 0 U(i,i) is exactly zero. + * + */ + int fsupc; /* first column in the supernode */ + int nsupc; /* no of columns in the supernode */ + int nsupr; /* no of rows in the supernode */ + int lptr; /* points to the starting subscript of the supernode */ + int pivptr, old_pivptr, diag, diagind; + float pivmax, rtemp, thresh; + float temp; + float *lu_sup_ptr; + float *lu_col_ptr; + int *lsub_ptr; + int isub, icol, k, itemp; + int *lsub, *xlsub; + float *lusup; + int *xlusup; + flops_t *ops = stat->ops; + + /* Initialize pointers */ + lsub = Glu->lsub; + xlsub = Glu->xlsub; + lusup = Glu->lusup; + xlusup = Glu->xlusup; + fsupc = (Glu->xsup)[(Glu->supno)[jcol]]; + nsupc = jcol - fsupc; /* excluding jcol; nsupc >= 0 */ + lptr = xlsub[fsupc]; + nsupr = xlsub[fsupc+1] - lptr; + lu_sup_ptr = &lusup[xlusup[fsupc]]; /* start of the current supernode */ + lu_col_ptr = &lusup[xlusup[jcol]]; /* start of jcol in the supernode */ + lsub_ptr = &lsub[lptr]; /* start of row indices of the supernode */ + +#ifdef DEBUG +if ( jcol == MIN_COL ) { + printf("Before cdiv: col %d\n", jcol); + for (k = nsupc; k < nsupr; k++) + printf(" lu[%d] %f\n", lsub_ptr[k], lu_col_ptr[k]); +} +#endif + + /* Determine the largest abs numerical value for partial pivoting; + Also search for user-specified pivot, and diagonal element. */ + if ( *usepr ) *pivrow = iperm_r[jcol]; + diagind = iperm_c[jcol]; + pivmax = 0.0; + pivptr = nsupc; + diag = EMPTY; + old_pivptr = nsupc; + for (isub = nsupc; isub < nsupr; ++isub) { + rtemp = fabs (lu_col_ptr[isub]); + if ( rtemp > pivmax ) { + pivmax = rtemp; + pivptr = isub; + } + if ( *usepr && lsub_ptr[isub] == *pivrow ) old_pivptr = isub; + if ( lsub_ptr[isub] == diagind ) diag = isub; + } + + /* Test for singularity */ + if ( pivmax == 0.0 ) { + *pivrow = lsub_ptr[pivptr]; + perm_r[*pivrow] = jcol; + *usepr = 0; + return (jcol+1); + } + + thresh = u * pivmax; + + /* Choose appropriate pivotal element by our policy. */ + if ( *usepr ) { + rtemp = fabs (lu_col_ptr[old_pivptr]); + if ( rtemp != 0.0 && rtemp >= thresh ) + pivptr = old_pivptr; + else + *usepr = 0; + } + if ( *usepr == 0 ) { + /* Use diagonal pivot? */ + if ( diag >= 0 ) { /* diagonal exists */ + rtemp = fabs (lu_col_ptr[diag]); + if ( rtemp != 0.0 && rtemp >= thresh ) pivptr = diag; + } + *pivrow = lsub_ptr[pivptr]; + } + + /* Record pivot row */ + perm_r[*pivrow] = jcol; + + /* Interchange row subscripts */ + if ( pivptr != nsupc ) { + itemp = lsub_ptr[pivptr]; + lsub_ptr[pivptr] = lsub_ptr[nsupc]; + lsub_ptr[nsupc] = itemp; + + /* Interchange numerical values as well, for the whole snode, such + * that L is indexed the same way as A. + */ + for (icol = 0; icol <= nsupc; icol++) { + itemp = pivptr + icol * nsupr; + temp = lu_sup_ptr[itemp]; + lu_sup_ptr[itemp] = lu_sup_ptr[nsupc + icol*nsupr]; + lu_sup_ptr[nsupc + icol*nsupr] = temp; + } + } /* if */ + + /* cdiv operation */ + ops[FACT] += nsupr - nsupc; + + temp = 1.0 / lu_col_ptr[nsupc]; + for (k = nsupc+1; k < nsupr; k++) + lu_col_ptr[k] *= temp; + + return 0; +} + diff --git a/intern/opennl/superlu/spruneL.c b/intern/opennl/superlu/spruneL.c new file mode 100644 index 00000000000..59702706375 --- /dev/null +++ b/intern/opennl/superlu/spruneL.c @@ -0,0 +1,149 @@ + + +/* + * -- SuperLU routine (version 2.0) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * November 15, 1997 + * + */ +/* + Copyright (c) 1994 by Xerox Corporation. All rights reserved. + + THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY + EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. + + Permission is hereby granted to use or copy this program for any + purpose, provided the above notices are retained on all copies. + Permission to modify the code and to distribute modified code is + granted, provided the above notices are retained, and a notice that + the code was modified is included with the above copyright notice. +*/ + +#include "ssp_defs.h" +#include "util.h" + +void +spruneL( + const int jcol, /* in */ + const int *perm_r, /* in */ + const int pivrow, /* in */ + const int nseg, /* in */ + const int *segrep, /* in */ + const int *repfnz, /* in */ + int *xprune, /* out */ + GlobalLU_t *Glu /* modified - global LU data structures */ + ) +{ +/* + * Purpose + * ======= + * Prunes the L-structure of supernodes whose L-structure + * contains the current pivot row "pivrow" + * + */ + float utemp; + int jsupno, irep, irep1, kmin, kmax, krow, movnum; + int i, ktemp, minloc, maxloc; + int do_prune; /* logical variable */ + int *xsup, *supno; + int *lsub, *xlsub; + float *lusup; + int *xlusup; + + xsup = Glu->xsup; + supno = Glu->supno; + lsub = Glu->lsub; + xlsub = Glu->xlsub; + lusup = Glu->lusup; + xlusup = Glu->xlusup; + + /* + * For each supernode-rep irep in U[*,j] + */ + jsupno = supno[jcol]; + for (i = 0; i < nseg; i++) { + + irep = segrep[i]; + irep1 = irep + 1; + do_prune = FALSE; + + /* Don't prune with a zero U-segment */ + if ( repfnz[irep] == EMPTY ) + continue; + + /* If a snode overlaps with the next panel, then the U-segment + * is fragmented into two parts -- irep and irep1. We should let + * pruning occur at the rep-column in irep1's snode. + */ + if ( supno[irep] == supno[irep1] ) /* Don't prune */ + continue; + + /* + * If it has not been pruned & it has a nonz in row L[pivrow,i] + */ + if ( supno[irep] != jsupno ) { + if ( xprune[irep] >= xlsub[irep1] ) { + kmin = xlsub[irep]; + kmax = xlsub[irep1] - 1; + for (krow = kmin; krow <= kmax; krow++) + if ( lsub[krow] == pivrow ) { + do_prune = TRUE; + break; + } + } + + if ( do_prune ) { + + /* Do a quicksort-type partition + * movnum=TRUE means that the num values have to be exchanged. + */ + movnum = FALSE; + if ( irep == xsup[supno[irep]] ) /* Snode of size 1 */ + movnum = TRUE; + + while ( kmin <= kmax ) { + + if ( perm_r[lsub[kmax]] == EMPTY ) + kmax--; + else if ( perm_r[lsub[kmin]] != EMPTY ) + kmin++; + else { /* kmin below pivrow, and kmax above pivrow: + * interchange the two subscripts + */ + ktemp = lsub[kmin]; + lsub[kmin] = lsub[kmax]; + lsub[kmax] = ktemp; + + /* If the supernode has only one column, then we + * only keep one set of subscripts. For any subscript + * interchange performed, similar interchange must be + * done on the numerical values. + */ + if ( movnum ) { + minloc = xlusup[irep] + (kmin - xlsub[irep]); + maxloc = xlusup[irep] + (kmax - xlsub[irep]); + utemp = lusup[minloc]; + lusup[minloc] = lusup[maxloc]; + lusup[maxloc] = utemp; + } + + kmin++; + kmax--; + + } + + } /* while */ + + xprune[irep] = kmin; /* Pruning */ + +#ifdef CHK_PRUNE + printf(" After spruneL(),using col %d: xprune[%d] = %d\n", + jcol, irep, kmin); +#endif + } /* if do_prune */ + + } /* if */ + + } /* for each U-segment... */ +} diff --git a/intern/opennl/superlu/ssnode_bmod.c b/intern/opennl/superlu/ssnode_bmod.c new file mode 100644 index 00000000000..fe97abd9ff6 --- /dev/null +++ b/intern/opennl/superlu/ssnode_bmod.c @@ -0,0 +1,117 @@ + +/* + * -- SuperLU routine (version 3.0) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * October 15, 2003 + * + */ +/* + Copyright (c) 1994 by Xerox Corporation. All rights reserved. + + THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY + EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. + + Permission is hereby granted to use or copy this program for any + purpose, provided the above notices are retained on all copies. + Permission to modify the code and to distribute modified code is + granted, provided the above notices are retained, and a notice that + the code was modified is included with the above copyright notice. +*/ + +#include "ssp_defs.h" + +void slsolve(int, int, float*, float*); +void smatvec(int, int, int, float*, float*, float*); + +/* + * Performs numeric block updates within the relaxed snode. + */ +int +ssnode_bmod ( + const int jcol, /* in */ + const int fsupc, /* in */ + float *dense, /* in */ + float *tempv, /* working array */ + GlobalLU_t *Glu, /* modified */ + SuperLUStat_t *stat /* output */ + ) +{ +#ifdef USE_VENDOR_BLAS +#ifdef _CRAY + _fcd ftcs1 = _cptofcd("L", strlen("L")), + ftcs2 = _cptofcd("N", strlen("N")), + ftcs3 = _cptofcd("U", strlen("U")); +#endif + int incx = 1, incy = 1; + float alpha = -1.0, beta = 1.0; +#endif + + int luptr, nsupc, nsupr, nrow; + int isub, irow, i, iptr; + register int ufirst, nextlu; + int *lsub, *xlsub; + float *lusup; + int *xlusup; + flops_t *ops = stat->ops; + + lsub = Glu->lsub; + xlsub = Glu->xlsub; + lusup = Glu->lusup; + xlusup = Glu->xlusup; + + nextlu = xlusup[jcol]; + + /* + * Process the supernodal portion of L\U[*,j] + */ + for (isub = xlsub[fsupc]; isub < xlsub[fsupc+1]; isub++) { + irow = lsub[isub]; + lusup[nextlu] = dense[irow]; + dense[irow] = 0; + ++nextlu; + } + + xlusup[jcol + 1] = nextlu; /* Initialize xlusup for next column */ + + if ( fsupc < jcol ) { + + luptr = xlusup[fsupc]; + nsupr = xlsub[fsupc+1] - xlsub[fsupc]; + nsupc = jcol - fsupc; /* Excluding jcol */ + ufirst = xlusup[jcol]; /* Points to the beginning of column + jcol in supernode L\U(jsupno). */ + nrow = nsupr - nsupc; + + ops[TRSV] += nsupc * (nsupc - 1); + ops[GEMV] += 2 * nrow * nsupc; + +#ifdef USE_VENDOR_BLAS +#ifdef _CRAY + STRSV( ftcs1, ftcs2, ftcs3, &nsupc, &lusup[luptr], &nsupr, + &lusup[ufirst], &incx ); + SGEMV( ftcs2, &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr, + &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy ); +#else + strsv_( "L", "N", "U", &nsupc, &lusup[luptr], &nsupr, + &lusup[ufirst], &incx ); + sgemv_( "N", &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr, + &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy ); +#endif +#else + slsolve ( nsupr, nsupc, &lusup[luptr], &lusup[ufirst] ); + smatvec ( nsupr, nrow, nsupc, &lusup[luptr+nsupc], + &lusup[ufirst], &tempv[0] ); + + /* Scatter tempv[*] into lusup[*] */ + iptr = ufirst + nsupc; + for (i = 0; i < nrow; i++) { + lusup[iptr++] -= tempv[i]; + tempv[i] = 0.0; + } +#endif + + } + + return 0; +} diff --git a/intern/opennl/superlu/ssnode_dfs.c b/intern/opennl/superlu/ssnode_dfs.c new file mode 100644 index 00000000000..c8974237a9a --- /dev/null +++ b/intern/opennl/superlu/ssnode_dfs.c @@ -0,0 +1,106 @@ + + +/* + * -- SuperLU routine (version 2.0) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * November 15, 1997 + * + */ +/* + Copyright (c) 1994 by Xerox Corporation. All rights reserved. + + THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY + EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. + + Permission is hereby granted to use or copy this program for any + purpose, provided the above notices are retained on all copies. + Permission to modify the code and to distribute modified code is + granted, provided the above notices are retained, and a notice that + the code was modified is included with the above copyright notice. +*/ + +#include "ssp_defs.h" +#include "util.h" + +int +ssnode_dfs ( + const int jcol, /* in - start of the supernode */ + const int kcol, /* in - end of the supernode */ + const int *asub, /* in */ + const int *xa_begin, /* in */ + const int *xa_end, /* in */ + int *xprune, /* out */ + int *marker, /* modified */ + GlobalLU_t *Glu /* modified */ + ) +{ +/* Purpose + * ======= + * ssnode_dfs() - Determine the union of the row structures of those + * columns within the relaxed snode. + * Note: The relaxed snodes are leaves of the supernodal etree, therefore, + * the portion outside the rectangular supernode must be zero. + * + * Return value + * ============ + * 0 success; + * >0 number of bytes allocated when run out of memory. + * + */ + register int i, k, ifrom, ito, nextl, new_next; + int nsuper, krow, kmark, mem_error; + int *xsup, *supno; + int *lsub, *xlsub; + int nzlmax; + + xsup = Glu->xsup; + supno = Glu->supno; + lsub = Glu->lsub; + xlsub = Glu->xlsub; + nzlmax = Glu->nzlmax; + + nsuper = ++supno[jcol]; /* Next available supernode number */ + nextl = xlsub[jcol]; + + for (i = jcol; i <= kcol; i++) { + /* For each nonzero in A[*,i] */ + for (k = xa_begin[i]; k < xa_end[i]; k++) { + krow = asub[k]; + kmark = marker[krow]; + if ( kmark != kcol ) { /* First time visit krow */ + marker[krow] = kcol; + lsub[nextl++] = krow; + if ( nextl >= nzlmax ) { + if ( (mem_error = sLUMemXpand(jcol, nextl, LSUB, &nzlmax, Glu)) ) + return (mem_error); + lsub = Glu->lsub; + } + } + } + supno[i] = nsuper; + } + + /* Supernode > 1, then make a copy of the subscripts for pruning */ + if ( jcol < kcol ) { + new_next = nextl + (nextl - xlsub[jcol]); + while ( new_next > nzlmax ) { + if ( (mem_error = sLUMemXpand(jcol, nextl, LSUB, &nzlmax, Glu)) ) + return (mem_error); + lsub = Glu->lsub; + } + ito = nextl; + for (ifrom = xlsub[jcol]; ifrom < nextl; ) + lsub[ito++] = lsub[ifrom++]; + for (i = jcol+1; i <= kcol; i++) xlsub[i] = nextl; + nextl = ito; + } + + xsup[nsuper+1] = kcol + 1; + supno[kcol+1] = nsuper; + xprune[kcol] = nextl; + xlsub[kcol+1] = nextl; + + return 0; +} + diff --git a/intern/opennl/superlu/ssp_blas2.c b/intern/opennl/superlu/ssp_blas2.c new file mode 100644 index 00000000000..347f9ab5fd4 --- /dev/null +++ b/intern/opennl/superlu/ssp_blas2.c @@ -0,0 +1,469 @@ + +/* + * -- SuperLU routine (version 3.0) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * October 15, 2003 + * + */ +/* + * File name: ssp_blas2.c + * Purpose: Sparse BLAS 2, using some dense BLAS 2 operations. + */ + +#include "ssp_defs.h" + +/* + * Function prototypes + */ +void susolve(int, int, float*, float*); +void slsolve(int, int, float*, float*); +void smatvec(int, int, int, float*, float*, float*); +int strsv_(char*, char*, char*, int*, float*, int*, float*, int*); + +int +sp_strsv(char *uplo, char *trans, char *diag, SuperMatrix *L, + SuperMatrix *U, float *x, SuperLUStat_t *stat, int *info) +{ +/* + * Purpose + * ======= + * + * sp_strsv() solves one of the systems of equations + * A*x = b, or A'*x = b, + * where b and x are n element vectors and A is a sparse unit , or + * non-unit, upper or lower triangular matrix. + * No test for singularity or near-singularity is included in this + * routine. Such tests must be performed before calling this routine. + * + * Parameters + * ========== + * + * uplo - (input) char* + * On entry, uplo specifies whether the matrix is an upper or + * lower triangular matrix as follows: + * uplo = 'U' or 'u' A is an upper triangular matrix. + * uplo = 'L' or 'l' A is a lower triangular matrix. + * + * trans - (input) char* + * On entry, trans specifies the equations to be solved as + * follows: + * trans = 'N' or 'n' A*x = b. + * trans = 'T' or 't' A'*x = b. + * trans = 'C' or 'c' A'*x = b. + * + * diag - (input) char* + * On entry, diag specifies whether or not A is unit + * triangular as follows: + * diag = 'U' or 'u' A is assumed to be unit triangular. + * diag = 'N' or 'n' A is not assumed to be unit + * triangular. + * + * L - (input) SuperMatrix* + * The factor L from the factorization Pr*A*Pc=L*U. Use + * compressed row subscripts storage for supernodes, + * i.e., L has types: Stype = SC, Dtype = SLU_S, Mtype = TRLU. + * + * U - (input) SuperMatrix* + * The factor U from the factorization Pr*A*Pc=L*U. + * U has types: Stype = NC, Dtype = SLU_S, Mtype = TRU. + * + * x - (input/output) float* + * Before entry, the incremented array X must contain the n + * element right-hand side vector b. On exit, X is overwritten + * with the solution vector x. + * + * info - (output) int* + * If *info = -i, the i-th argument had an illegal value. + * + */ +#ifdef _CRAY + _fcd ftcs1 = _cptofcd("L", strlen("L")), + ftcs2 = _cptofcd("N", strlen("N")), + ftcs3 = _cptofcd("U", strlen("U")); +#endif + SCformat *Lstore; + NCformat *Ustore; + float *Lval, *Uval; + int incx = 1; + int nrow; + int fsupc, nsupr, nsupc, luptr, istart, irow; + int i, k, iptr, jcol; + float *work; + flops_t solve_ops; + + /* Test the input parameters */ + *info = 0; + if ( !lsame_(uplo,"L") && !lsame_(uplo, "U") ) *info = -1; + else if ( !lsame_(trans, "N") && !lsame_(trans, "T") && + !lsame_(trans, "C")) *info = -2; + else if ( !lsame_(diag, "U") && !lsame_(diag, "N") ) *info = -3; + else if ( L->nrow != L->ncol || L->nrow < 0 ) *info = -4; + else if ( U->nrow != U->ncol || U->nrow < 0 ) *info = -5; + if ( *info ) { + i = -(*info); + xerbla_("sp_strsv", &i); + return 0; + } + + Lstore = L->Store; + Lval = Lstore->nzval; + Ustore = U->Store; + Uval = Ustore->nzval; + solve_ops = 0; + + if ( !(work = floatCalloc(L->nrow)) ) + ABORT("Malloc fails for work in sp_strsv()."); + + if ( lsame_(trans, "N") ) { /* Form x := inv(A)*x. */ + + if ( lsame_(uplo, "L") ) { + /* Form x := inv(L)*x */ + if ( L->nrow == 0 ) return 0; /* Quick return */ + + for (k = 0; k <= Lstore->nsuper; k++) { + fsupc = L_FST_SUPC(k); + istart = L_SUB_START(fsupc); + nsupr = L_SUB_START(fsupc+1) - istart; + nsupc = L_FST_SUPC(k+1) - fsupc; + luptr = L_NZ_START(fsupc); + nrow = nsupr - nsupc; + + solve_ops += nsupc * (nsupc - 1); + solve_ops += 2 * nrow * nsupc; + + if ( nsupc == 1 ) { + for (iptr=istart+1; iptr < L_SUB_START(fsupc+1); ++iptr) { + irow = L_SUB(iptr); + ++luptr; + x[irow] -= x[fsupc] * Lval[luptr]; + } + } else { +#ifdef USE_VENDOR_BLAS +#ifdef _CRAY + STRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr, + &x[fsupc], &incx); + + SGEMV(ftcs2, &nrow, &nsupc, &alpha, &Lval[luptr+nsupc], + &nsupr, &x[fsupc], &incx, &beta, &work[0], &incy); +#else + strsv_("L", "N", "U", &nsupc, &Lval[luptr], &nsupr, + &x[fsupc], &incx); + + sgemv_("N", &nrow, &nsupc, &alpha, &Lval[luptr+nsupc], + &nsupr, &x[fsupc], &incx, &beta, &work[0], &incy); +#endif +#else + slsolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc]); + + smatvec ( nsupr, nsupr-nsupc, nsupc, &Lval[luptr+nsupc], + &x[fsupc], &work[0] ); +#endif + + iptr = istart + nsupc; + for (i = 0; i < nrow; ++i, ++iptr) { + irow = L_SUB(iptr); + x[irow] -= work[i]; /* Scatter */ + work[i] = 0.0; + + } + } + } /* for k ... */ + + } else { + /* Form x := inv(U)*x */ + + if ( U->nrow == 0 ) return 0; /* Quick return */ + + for (k = Lstore->nsuper; k >= 0; k--) { + fsupc = L_FST_SUPC(k); + nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc); + nsupc = L_FST_SUPC(k+1) - fsupc; + luptr = L_NZ_START(fsupc); + + solve_ops += nsupc * (nsupc + 1); + + if ( nsupc == 1 ) { + x[fsupc] /= Lval[luptr]; + for (i = U_NZ_START(fsupc); i < U_NZ_START(fsupc+1); ++i) { + irow = U_SUB(i); + x[irow] -= x[fsupc] * Uval[i]; + } + } else { +#ifdef USE_VENDOR_BLAS +#ifdef _CRAY + STRSV(ftcs3, ftcs2, ftcs2, &nsupc, &Lval[luptr], &nsupr, + &x[fsupc], &incx); +#else + strsv_("U", "N", "N", &nsupc, &Lval[luptr], &nsupr, + &x[fsupc], &incx); +#endif +#else + susolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc] ); +#endif + + for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) { + solve_ops += 2*(U_NZ_START(jcol+1) - U_NZ_START(jcol)); + for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); + i++) { + irow = U_SUB(i); + x[irow] -= x[jcol] * Uval[i]; + } + } + } + } /* for k ... */ + + } + } else { /* Form x := inv(A')*x */ + + if ( lsame_(uplo, "L") ) { + /* Form x := inv(L')*x */ + if ( L->nrow == 0 ) return 0; /* Quick return */ + + for (k = Lstore->nsuper; k >= 0; --k) { + fsupc = L_FST_SUPC(k); + istart = L_SUB_START(fsupc); + nsupr = L_SUB_START(fsupc+1) - istart; + nsupc = L_FST_SUPC(k+1) - fsupc; + luptr = L_NZ_START(fsupc); + + solve_ops += 2 * (nsupr - nsupc) * nsupc; + + for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) { + iptr = istart + nsupc; + for (i = L_NZ_START(jcol) + nsupc; + i < L_NZ_START(jcol+1); i++) { + irow = L_SUB(iptr); + x[jcol] -= x[irow] * Lval[i]; + iptr++; + } + } + + if ( nsupc > 1 ) { + solve_ops += nsupc * (nsupc - 1); +#ifdef _CRAY + ftcs1 = _cptofcd("L", strlen("L")); + ftcs2 = _cptofcd("T", strlen("T")); + ftcs3 = _cptofcd("U", strlen("U")); + STRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr, + &x[fsupc], &incx); +#else + strsv_("L", "T", "U", &nsupc, &Lval[luptr], &nsupr, + &x[fsupc], &incx); +#endif + } + } + } else { + /* Form x := inv(U')*x */ + if ( U->nrow == 0 ) return 0; /* Quick return */ + + for (k = 0; k <= Lstore->nsuper; k++) { + fsupc = L_FST_SUPC(k); + nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc); + nsupc = L_FST_SUPC(k+1) - fsupc; + luptr = L_NZ_START(fsupc); + + for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) { + solve_ops += 2*(U_NZ_START(jcol+1) - U_NZ_START(jcol)); + for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); i++) { + irow = U_SUB(i); + x[jcol] -= x[irow] * Uval[i]; + } + } + + solve_ops += nsupc * (nsupc + 1); + + if ( nsupc == 1 ) { + x[fsupc] /= Lval[luptr]; + } else { +#ifdef _CRAY + ftcs1 = _cptofcd("U", strlen("U")); + ftcs2 = _cptofcd("T", strlen("T")); + ftcs3 = _cptofcd("N", strlen("N")); + STRSV( ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr, + &x[fsupc], &incx); +#else + strsv_("U", "T", "N", &nsupc, &Lval[luptr], &nsupr, + &x[fsupc], &incx); +#endif + } + } /* for k ... */ + } + } + + stat->ops[SOLVE] += solve_ops; + SUPERLU_FREE(work); + return 0; +} + + + + +int +sp_sgemv(char *trans, float alpha, SuperMatrix *A, float *x, + int incx, float beta, float *y, int incy) +{ +/* Purpose + ======= + + sp_sgemv() performs one of the matrix-vector operations + y := alpha*A*x + beta*y, or y := alpha*A'*x + beta*y, + where alpha and beta are scalars, x and y are vectors and A is a + sparse A->nrow by A->ncol matrix. + + Parameters + ========== + + TRANS - (input) char* + On entry, TRANS specifies the operation to be performed as + follows: + TRANS = 'N' or 'n' y := alpha*A*x + beta*y. + TRANS = 'T' or 't' y := alpha*A'*x + beta*y. + TRANS = 'C' or 'c' y := alpha*A'*x + beta*y. + + ALPHA - (input) float + On entry, ALPHA specifies the scalar alpha. + + A - (input) SuperMatrix* + Matrix A with a sparse format, of dimension (A->nrow, A->ncol). + Currently, the type of A can be: + Stype = NC or NCP; Dtype = SLU_S; Mtype = GE. + In the future, more general A can be handled. + + X - (input) float*, array of DIMENSION at least + ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n' + and at least + ( 1 + ( m - 1 )*abs( INCX ) ) otherwise. + Before entry, the incremented array X must contain the + vector x. + + INCX - (input) int + On entry, INCX specifies the increment for the elements of + X. INCX must not be zero. + + BETA - (input) float + On entry, BETA specifies the scalar beta. When BETA is + supplied as zero then Y need not be set on input. + + Y - (output) float*, array of DIMENSION at least + ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n' + and at least + ( 1 + ( n - 1 )*abs( INCY ) ) otherwise. + Before entry with BETA non-zero, the incremented array Y + must contain the vector y. On exit, Y is overwritten by the + updated vector y. + + INCY - (input) int + On entry, INCY specifies the increment for the elements of + Y. INCY must not be zero. + + ==== Sparse Level 2 Blas routine. +*/ + + /* Local variables */ + NCformat *Astore; + float *Aval; + int info; + float temp; + int lenx, leny, i, j, irow; + int iy, jx, jy, kx, ky; + int notran; + + notran = lsame_(trans, "N"); + Astore = A->Store; + Aval = Astore->nzval; + + /* Test the input parameters */ + info = 0; + if ( !notran && !lsame_(trans, "T") && !lsame_(trans, "C")) info = 1; + else if ( A->nrow < 0 || A->ncol < 0 ) info = 3; + else if (incx == 0) info = 5; + else if (incy == 0) info = 8; + if (info != 0) { + xerbla_("sp_sgemv ", &info); + return 0; + } + + /* Quick return if possible. */ + if (A->nrow == 0 || A->ncol == 0 || (alpha == 0. && beta == 1.)) + return 0; + + /* Set LENX and LENY, the lengths of the vectors x and y, and set + up the start points in X and Y. */ + if (lsame_(trans, "N")) { + lenx = A->ncol; + leny = A->nrow; + } else { + lenx = A->nrow; + leny = A->ncol; + } + if (incx > 0) kx = 0; + else kx = - (lenx - 1) * incx; + if (incy > 0) ky = 0; + else ky = - (leny - 1) * incy; + + /* Start the operations. In this version the elements of A are + accessed sequentially with one pass through A. */ + /* First form y := beta*y. */ + if (beta != 1.) { + if (incy == 1) { + if (beta == 0.) + for (i = 0; i < leny; ++i) y[i] = 0.; + else + for (i = 0; i < leny; ++i) y[i] = beta * y[i]; + } else { + iy = ky; + if (beta == 0.) + for (i = 0; i < leny; ++i) { + y[iy] = 0.; + iy += incy; + } + else + for (i = 0; i < leny; ++i) { + y[iy] = beta * y[iy]; + iy += incy; + } + } + } + + if (alpha == 0.) return 0; + + if ( notran ) { + /* Form y := alpha*A*x + y. */ + jx = kx; + if (incy == 1) { + for (j = 0; j < A->ncol; ++j) { + if (x[jx] != 0.) { + temp = alpha * x[jx]; + for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { + irow = Astore->rowind[i]; + y[irow] += temp * Aval[i]; + } + } + jx += incx; + } + } else { + ABORT("Not implemented."); + } + } else { + /* Form y := alpha*A'*x + y. */ + jy = ky; + if (incx == 1) { + for (j = 0; j < A->ncol; ++j) { + temp = 0.; + for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { + irow = Astore->rowind[i]; + temp += Aval[i] * x[irow]; + } + y[jy] += alpha * temp; + jy += incy; + } + } else { + ABORT("Not implemented."); + } + } + return 0; +} /* sp_sgemv */ + + + diff --git a/intern/opennl/superlu/ssp_blas3.c b/intern/opennl/superlu/ssp_blas3.c new file mode 100644 index 00000000000..19086077c4c --- /dev/null +++ b/intern/opennl/superlu/ssp_blas3.c @@ -0,0 +1,121 @@ + + +/* + * -- SuperLU routine (version 2.0) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * November 15, 1997 + * + */ +/* + * File name: sp_blas3.c + * Purpose: Sparse BLAS3, using some dense BLAS3 operations. + */ + +#include "ssp_defs.h" +#include "util.h" + +int +sp_sgemm(char *transa, int n, + float alpha, SuperMatrix *A, float *b, int ldb, + float beta, float *c, int ldc) +{ +/* Purpose + ======= + + sp_s performs one of the matrix-matrix operations + + C := alpha*op( A )*op( B ) + beta*C, + + where op( X ) is one of + + op( X ) = X or op( X ) = X' or op( X ) = conjg( X' ), + + alpha and beta are scalars, and A, B and C are matrices, with op( A ) + an m by k matrix, op( B ) a k by n matrix and C an m by n matrix. + + + Parameters + ========== + + TRANSA - (input) char* + On entry, TRANSA specifies the form of op( A ) to be used in + the matrix multiplication as follows: + TRANSA = 'N' or 'n', op( A ) = A. + TRANSA = 'T' or 't', op( A ) = A'. + TRANSA = 'C' or 'c', op( A ) = conjg( A' ). + Unchanged on exit. + + TRANSB - (input) char* + On entry, TRANSB specifies the form of op( B ) to be used in + the matrix multiplication as follows: + TRANSB = 'N' or 'n', op( B ) = B. + TRANSB = 'T' or 't', op( B ) = B'. + TRANSB = 'C' or 'c', op( B ) = conjg( B' ). + Unchanged on exit. + + M - (input) int + On entry, M specifies the number of rows of the matrix + op( A ) and of the matrix C. M must be at least zero. + Unchanged on exit. + + N - (input) int + On entry, N specifies the number of columns of the matrix + op( B ) and the number of columns of the matrix C. N must be + at least zero. + Unchanged on exit. + + K - (input) int + On entry, K specifies the number of columns of the matrix + op( A ) and the number of rows of the matrix op( B ). K must + be at least zero. + Unchanged on exit. + + ALPHA - (input) float + On entry, ALPHA specifies the scalar alpha. + + A - (input) SuperMatrix* + Matrix A with a sparse format, of dimension (A->nrow, A->ncol). + Currently, the type of A can be: + Stype = NC or NCP; Dtype = SLU_S; Mtype = GE. + In the future, more general A can be handled. + + B - FLOAT PRECISION array of DIMENSION ( LDB, kb ), where kb is + n when TRANSB = 'N' or 'n', and is k otherwise. + Before entry with TRANSB = 'N' or 'n', the leading k by n + part of the array B must contain the matrix B, otherwise + the leading n by k part of the array B must contain the + matrix B. + Unchanged on exit. + + LDB - (input) int + On entry, LDB specifies the first dimension of B as declared + in the calling (sub) program. LDB must be at least max( 1, n ). + Unchanged on exit. + + BETA - (input) float + On entry, BETA specifies the scalar beta. When BETA is + supplied as zero then C need not be set on input. + + C - FLOAT PRECISION array of DIMENSION ( LDC, n ). + Before entry, the leading m by n part of the array C must + contain the matrix C, except when beta is zero, in which + case C need not be set on entry. + On exit, the array C is overwritten by the m by n matrix + ( alpha*op( A )*B + beta*C ). + + LDC - (input) int + On entry, LDC specifies the first dimension of C as declared + in the calling (sub)program. LDC must be at least max(1,m). + Unchanged on exit. + + ==== Sparse Level 3 Blas routine. +*/ + int incx = 1, incy = 1; + int j; + + for (j = 0; j < n; ++j) { + sp_sgemv(transa, alpha, A, &b[ldb*j], incx, beta, &c[ldc*j], incy); + } + return 0; +} diff --git a/intern/opennl/superlu/ssp_defs.h b/intern/opennl/superlu/ssp_defs.h new file mode 100644 index 00000000000..5b4e86b175b --- /dev/null +++ b/intern/opennl/superlu/ssp_defs.h @@ -0,0 +1,234 @@ + +/* + * -- SuperLU routine (version 3.0) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * October 15, 2003 + * + */ +#ifndef __SUPERLU_sSP_DEFS /* allow multiple inclusions */ +#define __SUPERLU_sSP_DEFS + +/* + * File name: ssp_defs.h + * Purpose: Sparse matrix types and function prototypes + * History: + */ + +#ifdef _CRAY +#include +#include +#endif + +/* Define my integer type int_t */ +typedef int int_t; /* default */ + +#include "Cnames.h" +#include "supermatrix.h" +#include "util.h" + + +/* + * Global data structures used in LU factorization - + * + * nsuper: #supernodes = nsuper + 1, numbered [0, nsuper]. + * (xsup,supno): supno[i] is the supernode no to which i belongs; + * xsup(s) points to the beginning of the s-th supernode. + * e.g. supno 0 1 2 2 3 3 3 4 4 4 4 4 (n=12) + * xsup 0 1 2 4 7 12 + * Note: dfs will be performed on supernode rep. relative to the new + * row pivoting ordering + * + * (xlsub,lsub): lsub[*] contains the compressed subscript of + * rectangular supernodes; xlsub[j] points to the starting + * location of the j-th column in lsub[*]. Note that xlsub + * is indexed by column. + * Storage: original row subscripts + * + * During the course of sparse LU factorization, we also use + * (xlsub,lsub) for the purpose of symmetric pruning. For each + * supernode {s,s+1,...,t=s+r} with first column s and last + * column t, the subscript set + * lsub[j], j=xlsub[s], .., xlsub[s+1]-1 + * is the structure of column s (i.e. structure of this supernode). + * It is used for the storage of numerical values. + * Furthermore, + * lsub[j], j=xlsub[t], .., xlsub[t+1]-1 + * is the structure of the last column t of this supernode. + * It is for the purpose of symmetric pruning. Therefore, the + * structural subscripts can be rearranged without making physical + * interchanges among the numerical values. + * + * However, if the supernode has only one column, then we + * only keep one set of subscripts. For any subscript interchange + * performed, similar interchange must be done on the numerical + * values. + * + * The last column structures (for pruning) will be removed + * after the numercial LU factorization phase. + * + * (xlusup,lusup): lusup[*] contains the numerical values of the + * rectangular supernodes; xlusup[j] points to the starting + * location of the j-th column in storage vector lusup[*] + * Note: xlusup is indexed by column. + * Each rectangular supernode is stored by column-major + * scheme, consistent with Fortran 2-dim array storage. + * + * (xusub,ucol,usub): ucol[*] stores the numerical values of + * U-columns outside the rectangular supernodes. The row + * subscript of nonzero ucol[k] is stored in usub[k]. + * xusub[i] points to the starting location of column i in ucol. + * Storage: new row subscripts; that is subscripts of PA. + */ +typedef struct { + int *xsup; /* supernode and column mapping */ + int *supno; + int *lsub; /* compressed L subscripts */ + int *xlsub; + float *lusup; /* L supernodes */ + int *xlusup; + float *ucol; /* U columns */ + int *usub; + int *xusub; + int nzlmax; /* current max size of lsub */ + int nzumax; /* " " " ucol */ + int nzlumax; /* " " " lusup */ + int n; /* number of columns in the matrix */ + LU_space_t MemModel; /* 0 - system malloc'd; 1 - user provided */ +} GlobalLU_t; + +typedef struct { + float for_lu; + float total_needed; + int expansions; +} mem_usage_t; + +#ifdef __cplusplus +extern "C" { +#endif + +/* Driver routines */ +extern void +sgssv(superlu_options_t *, SuperMatrix *, int *, int *, SuperMatrix *, + SuperMatrix *, SuperMatrix *, SuperLUStat_t *, int *); +extern void +sgssvx(superlu_options_t *, SuperMatrix *, int *, int *, int *, + char *, float *, float *, SuperMatrix *, SuperMatrix *, + void *, int, SuperMatrix *, SuperMatrix *, + float *, float *, float *, float *, + mem_usage_t *, SuperLUStat_t *, int *); + +/* Supernodal LU factor related */ +extern void +sCreate_CompCol_Matrix(SuperMatrix *, int, int, int, float *, + int *, int *, Stype_t, Dtype_t, Mtype_t); +extern void +sCreate_CompRow_Matrix(SuperMatrix *, int, int, int, float *, + int *, int *, Stype_t, Dtype_t, Mtype_t); +extern void +sCopy_CompCol_Matrix(SuperMatrix *, SuperMatrix *); +extern void +sCreate_Dense_Matrix(SuperMatrix *, int, int, float *, int, + Stype_t, Dtype_t, Mtype_t); +extern void +sCreate_SuperNode_Matrix(SuperMatrix *, int, int, int, float *, + int *, int *, int *, int *, int *, + Stype_t, Dtype_t, Mtype_t); +extern void +sCopy_Dense_Matrix(int, int, float *, int, float *, int); + +extern void countnz (const int, int *, int *, int *, GlobalLU_t *); +extern void fixupL (const int, const int *, GlobalLU_t *); + +extern void sallocateA (int, int, float **, int **, int **); +extern void sgstrf (superlu_options_t*, SuperMatrix*, + int, int, int*, void *, int, int *, int *, + SuperMatrix *, SuperMatrix *, SuperLUStat_t*, int *); +extern int ssnode_dfs (const int, const int, const int *, const int *, + const int *, int *, int *, GlobalLU_t *); +extern int ssnode_bmod (const int, const int, float *, + float *, GlobalLU_t *, SuperLUStat_t*); +extern void spanel_dfs (const int, const int, const int, SuperMatrix *, + int *, int *, float *, int *, int *, int *, + int *, int *, int *, int *, GlobalLU_t *); +extern void spanel_bmod (const int, const int, const int, const int, + float *, float *, int *, int *, + GlobalLU_t *, SuperLUStat_t*); +extern int scolumn_dfs (const int, const int, int *, int *, int *, int *, + int *, int *, int *, int *, int *, GlobalLU_t *); +extern int scolumn_bmod (const int, const int, float *, + float *, int *, int *, int, + GlobalLU_t *, SuperLUStat_t*); +extern int scopy_to_ucol (int, int, int *, int *, int *, + float *, GlobalLU_t *); +extern int spivotL (const int, const float, int *, int *, + int *, int *, int *, GlobalLU_t *, SuperLUStat_t*); +extern void spruneL (const int, const int *, const int, const int, + const int *, const int *, int *, GlobalLU_t *); +extern void sreadmt (int *, int *, int *, float **, int **, int **); +extern void sGenXtrue (int, int, float *, int); +extern void sFillRHS (trans_t, int, float *, int, SuperMatrix *, + SuperMatrix *); +extern void sgstrs (trans_t, SuperMatrix *, SuperMatrix *, int *, int *, + SuperMatrix *, SuperLUStat_t*, int *); + + +/* Driver related */ + +extern void sgsequ (SuperMatrix *, float *, float *, float *, + float *, float *, int *); +extern void slaqgs (SuperMatrix *, float *, float *, float, + float, float, char *); +extern void sgscon (char *, SuperMatrix *, SuperMatrix *, + float, float *, SuperLUStat_t*, int *); +extern float sPivotGrowth(int, SuperMatrix *, int *, + SuperMatrix *, SuperMatrix *); +extern void sgsrfs (trans_t, SuperMatrix *, SuperMatrix *, + SuperMatrix *, int *, int *, char *, float *, + float *, SuperMatrix *, SuperMatrix *, + float *, float *, SuperLUStat_t*, int *); + +extern int sp_strsv (char *, char *, char *, SuperMatrix *, + SuperMatrix *, float *, SuperLUStat_t*, int *); +extern int sp_sgemv (char *, float, SuperMatrix *, float *, + int, float, float *, int); + +extern int sp_sgemm (char *, int, float, + SuperMatrix *, float *, int, float, + float *, int); + +/* Memory-related */ +extern int sLUMemInit (fact_t, void *, int, int, int, int, int, + SuperMatrix *, SuperMatrix *, + GlobalLU_t *, int **, float **); +extern void sSetRWork (int, int, float *, float **, float **); +extern void sLUWorkFree (int *, float *, GlobalLU_t *); +extern int sLUMemXpand (int, int, MemType, int *, GlobalLU_t *); + +extern float *floatMalloc(int); +extern float *floatCalloc(int); +extern int smemory_usage(const int, const int, const int, const int); +extern int sQuerySpace (SuperMatrix *, SuperMatrix *, mem_usage_t *); + +/* Auxiliary routines */ +extern void sreadhb(int *, int *, int *, float **, int **, int **); +extern void sCompRow_to_CompCol(int, int, int, float*, int*, int*, + float **, int **, int **); +extern void sfill (float *, int, float); +extern void sinf_norm_error (int, SuperMatrix *, float *); +extern void PrintPerf (SuperMatrix *, SuperMatrix *, mem_usage_t *, + float, float, float *, float *, char *); + +/* Routines for debugging */ +extern void sPrint_CompCol_Matrix(char *, SuperMatrix *); +extern void sPrint_SuperNode_Matrix(char *, SuperMatrix *); +extern void sPrint_Dense_Matrix(char *, SuperMatrix *); +extern void print_lu_col(char *, int, int, int *, GlobalLU_t *); +extern void check_tempv(int, float *); + +#ifdef __cplusplus + } +#endif + +#endif /* __SUPERLU_sSP_DEFS */ + diff --git a/intern/opennl/superlu/strsv.c b/intern/opennl/superlu/strsv.c new file mode 100644 index 00000000000..2f6a92c0d0d --- /dev/null +++ b/intern/opennl/superlu/strsv.c @@ -0,0 +1,331 @@ + +/* Subroutine */ int strsv_(char *uplo, char *trans, char *diag, int *n, + float *a, int *lda, float *x, int *incx) +{ + + + /* System generated locals */ + int i__1, i__2; + + /* Local variables */ + static int info; + static float temp; + static int i, j; + extern int lsame_(char *, char *); + static int ix, jx, kx; + extern /* Subroutine */ int xerbla_(char *, int *); + static int nounit; + + +/* Purpose + ======= + + STRSV solves one of the systems of equations + + A*x = b, or A'*x = b, + + where b and x are n element vectors and A is an n by n unit, or + non-unit, upper or lower triangular matrix. + + No test for singularity or near-singularity is included in this + routine. Such tests must be performed before calling this routine. + + Parameters + ========== + + UPLO - CHARACTER*1. + On entry, UPLO specifies whether the matrix is an upper or + lower triangular matrix as follows: + + UPLO = 'U' or 'u' A is an upper triangular matrix. + + UPLO = 'L' or 'l' A is a lower triangular matrix. + + Unchanged on exit. + + TRANS - CHARACTER*1. + On entry, TRANS specifies the equations to be solved as + follows: + + TRANS = 'N' or 'n' A*x = b. + + TRANS = 'T' or 't' A'*x = b. + + TRANS = 'C' or 'c' A'*x = b. + + Unchanged on exit. + + DIAG - CHARACTER*1. + On entry, DIAG specifies whether or not A is unit + triangular as follows: + + DIAG = 'U' or 'u' A is assumed to be unit triangular. + + DIAG = 'N' or 'n' A is not assumed to be unit + triangular. + + Unchanged on exit. + + N - INTEGER. + On entry, N specifies the order of the matrix A. + N must be at least zero. + Unchanged on exit. + + A - REAL array of DIMENSION ( LDA, n ). + Before entry with UPLO = 'U' or 'u', the leading n by n + upper triangular part of the array A must contain the upper + + triangular matrix and the strictly lower triangular part of + + A is not referenced. + Before entry with UPLO = 'L' or 'l', the leading n by n + lower triangular part of the array A must contain the lower + + triangular matrix and the strictly upper triangular part of + + A is not referenced. + Note that when DIAG = 'U' or 'u', the diagonal elements of + + A are not referenced either, but are assumed to be unity. + Unchanged on exit. + + LDA - INTEGER. + On entry, LDA specifies the first dimension of A as declared + + in the calling (sub) program. LDA must be at least + max( 1, n ). + Unchanged on exit. + + X - REAL array of dimension at least + ( 1 + ( n - 1 )*abs( INCX ) ). + Before entry, the incremented array X must contain the n + element right-hand side vector b. On exit, X is overwritten + + with the solution vector x. + + INCX - INTEGER. + On entry, INCX specifies the increment for the elements of + X. INCX must not be zero. + Unchanged on exit. + + + Level 2 Blas routine. + + -- Written on 22-October-1986. + Jack Dongarra, Argonne National Lab. + Jeremy Du Croz, Nag Central Office. + Sven Hammarling, Nag Central Office. + Richard Hanson, Sandia National Labs. + + + + Test the input parameters. + + + Parameter adjustments + Function Body */ +#define X(I) x[(I)-1] + +#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)] + + info = 0; + if (! lsame_(uplo, "U") && ! lsame_(uplo, "L")) { + info = 1; + } else if (! lsame_(trans, "N") && ! lsame_(trans, "T") && + ! lsame_(trans, "C")) { + info = 2; + } else if (! lsame_(diag, "U") && ! lsame_(diag, "N")) { + info = 3; + } else if (*n < 0) { + info = 4; + } else if (*lda < ((1 > *n)? 1: *n)) { + info = 6; + } else if (*incx == 0) { + info = 8; + } + if (info != 0) { + xerbla_("STRSV ", &info); + return 0; + } + +/* Quick return if possible. */ + + if (*n == 0) { + return 0; + } + + nounit = lsame_(diag, "N"); + +/* Set up the start point in X if the increment is not unity. This + will be ( N - 1 )*INCX too small for descending loops. */ + + if (*incx <= 0) { + kx = 1 - (*n - 1) * *incx; + } else if (*incx != 1) { + kx = 1; + } + +/* Start the operations. In this version the elements of A are + accessed sequentially with one pass through A. */ + + if (lsame_(trans, "N")) { + +/* Form x := inv( A )*x. */ + + if (lsame_(uplo, "U")) { + if (*incx == 1) { + for (j = *n; j >= 1; --j) { + if (X(j) != 0.f) { + if (nounit) { + X(j) /= A(j,j); + } + temp = X(j); + for (i = j - 1; i >= 1; --i) { + X(i) -= temp * A(i,j); +/* L10: */ + } + } +/* L20: */ + } + } else { + jx = kx + (*n - 1) * *incx; + for (j = *n; j >= 1; --j) { + if (X(jx) != 0.f) { + if (nounit) { + X(jx) /= A(j,j); + } + temp = X(jx); + ix = jx; + for (i = j - 1; i >= 1; --i) { + ix -= *incx; + X(ix) -= temp * A(i,j); +/* L30: */ + } + } + jx -= *incx; +/* L40: */ + } + } + } else { + if (*incx == 1) { + i__1 = *n; + for (j = 1; j <= *n; ++j) { + if (X(j) != 0.f) { + if (nounit) { + X(j) /= A(j,j); + } + temp = X(j); + i__2 = *n; + for (i = j + 1; i <= *n; ++i) { + X(i) -= temp * A(i,j); +/* L50: */ + } + } +/* L60: */ + } + } else { + jx = kx; + i__1 = *n; + for (j = 1; j <= *n; ++j) { + if (X(jx) != 0.f) { + if (nounit) { + X(jx) /= A(j,j); + } + temp = X(jx); + ix = jx; + i__2 = *n; + for (i = j + 1; i <= *n; ++i) { + ix += *incx; + X(ix) -= temp * A(i,j); +/* L70: */ + } + } + jx += *incx; +/* L80: */ + } + } + } + } else { + +/* Form x := inv( A' )*x. */ + + if (lsame_(uplo, "U")) { + if (*incx == 1) { + i__1 = *n; + for (j = 1; j <= *n; ++j) { + temp = X(j); + i__2 = j - 1; + for (i = 1; i <= j-1; ++i) { + temp -= A(i,j) * X(i); +/* L90: */ + } + if (nounit) { + temp /= A(j,j); + } + X(j) = temp; +/* L100: */ + } + } else { + jx = kx; + i__1 = *n; + for (j = 1; j <= *n; ++j) { + temp = X(jx); + ix = kx; + i__2 = j - 1; + for (i = 1; i <= j-1; ++i) { + temp -= A(i,j) * X(ix); + ix += *incx; +/* L110: */ + } + if (nounit) { + temp /= A(j,j); + } + X(jx) = temp; + jx += *incx; +/* L120: */ + } + } + } else { + if (*incx == 1) { + for (j = *n; j >= 1; --j) { + temp = X(j); + i__1 = j + 1; + for (i = *n; i >= j+1; --i) { + temp -= A(i,j) * X(i); +/* L130: */ + } + if (nounit) { + temp /= A(j,j); + } + X(j) = temp; +/* L140: */ + } + } else { + kx += (*n - 1) * *incx; + jx = kx; + for (j = *n; j >= 1; --j) { + temp = X(jx); + ix = kx; + i__1 = j + 1; + for (i = *n; i >= j+1; --i) { + temp -= A(i,j) * X(ix); + ix -= *incx; +/* L150: */ + } + if (nounit) { + temp /= A(j,j); + } + X(jx) = temp; + jx -= *incx; +/* L160: */ + } + } + } + } + + return 0; + +/* End of STRSV . */ + +} /* strsv_ */ + diff --git a/intern/opennl/superlu/superlu_timer.c b/intern/opennl/superlu/superlu_timer.c new file mode 100644 index 00000000000..798fd59d4ea --- /dev/null +++ b/intern/opennl/superlu/superlu_timer.c @@ -0,0 +1,55 @@ +/* + * Purpose + * ======= + * Returns the time in seconds used by the process. + * + * Note: the timer function call is machine dependent. Use conditional + * compilation to choose the appropriate function. + * + */ + +/* We want this flag, safer than putting in build system */ +#define NO_TIMER + +#ifdef SUN +/* + * It uses the system call gethrtime(3C), which is accurate to + * nanoseconds. +*/ +#include + +double SuperLU_timer_() { + return ( (double)gethrtime() / 1e9 ); +} + +#else + +#ifndef NO_TIMER +#include +#include +#include +#include +#endif + +#ifndef CLK_TCK +#define CLK_TCK 60 +#endif + +double SuperLU_timer_() +{ +#ifdef NO_TIMER + /* no sys/times.h on WIN32 */ + double tmp; + tmp = 0.0; +#else + struct tms use; + double tmp; + times(&use); + tmp = use.tms_utime; + tmp += use.tms_stime; +#endif + return (double)(tmp) / CLK_TCK; +} + +#endif + diff --git a/intern/opennl/superlu/supermatrix.h b/intern/opennl/superlu/supermatrix.h new file mode 100644 index 00000000000..665e22dc91f --- /dev/null +++ b/intern/opennl/superlu/supermatrix.h @@ -0,0 +1,140 @@ +#ifndef __SUPERLU_SUPERMATRIX /* allow multiple inclusions */ +#define __SUPERLU_SUPERMATRIX + +/******************************************** + * The matrix types are defined as follows. * + ********************************************/ +typedef enum { + SLU_NC, /* column-wise, no supernode */ + SLU_NR, /* row-wize, no supernode */ + SLU_SC, /* column-wise, supernode */ + SLU_SR, /* row-wise, supernode */ + SLU_NCP, /* column-wise, column-permuted, no supernode + (The consecutive columns of nonzeros, after permutation, + may not be stored contiguously.) */ + SLU_DN /* Fortran style column-wise storage for dense matrix */ +} Stype_t; + +typedef enum { + SLU_S, /* single */ + SLU_D, /* double */ + SLU_C, /* single complex */ + SLU_Z /* double complex */ +} Dtype_t; + +typedef enum { + SLU_GE, /* general */ + SLU_TRLU, /* lower triangular, unit diagonal */ + SLU_TRUU, /* upper triangular, unit diagonal */ + SLU_TRL, /* lower triangular */ + SLU_TRU, /* upper triangular */ + SLU_SYL, /* symmetric, store lower half */ + SLU_SYU, /* symmetric, store upper half */ + SLU_HEL, /* Hermitian, store lower half */ + SLU_HEU /* Hermitian, store upper half */ +} Mtype_t; + +typedef struct { + Stype_t Stype; /* Storage type: interprets the storage structure + pointed to by *Store. */ + Dtype_t Dtype; /* Data type. */ + Mtype_t Mtype; /* Matrix type: describes the mathematical property of + the matrix. */ + int_t nrow; /* number of rows */ + int_t ncol; /* number of columns */ + void *Store; /* pointer to the actual storage of the matrix */ +} SuperMatrix; + +/*********************************************** + * The storage schemes are defined as follows. * + ***********************************************/ + +/* Stype == NC (Also known as Harwell-Boeing sparse matrix format) */ +typedef struct { + int_t nnz; /* number of nonzeros in the matrix */ + void *nzval; /* pointer to array of nonzero values, packed by column */ + int_t *rowind; /* pointer to array of row indices of the nonzeros */ + int_t *colptr; /* pointer to array of beginning of columns in nzval[] + and rowind[] */ + /* Note: + Zero-based indexing is used; + colptr[] has ncol+1 entries, the last one pointing + beyond the last column, so that colptr[ncol] = nnz. */ +} NCformat; + +/* Stype == NR (Also known as row compressed storage (RCS). */ +typedef struct { + int_t nnz; /* number of nonzeros in the matrix */ + void *nzval; /* pointer to array of nonzero values, packed by row */ + int_t *colind; /* pointer to array of column indices of the nonzeros */ + int_t *rowptr; /* pointer to array of beginning of rows in nzval[] + and colind[] */ + /* Note: + Zero-based indexing is used; + nzval[] and colind[] are of the same length, nnz; + rowptr[] has nrow+1 entries, the last one pointing + beyond the last column, so that rowptr[nrow] = nnz. */ +} NRformat; + +/* Stype == SC */ +typedef struct { + int_t nnz; /* number of nonzeros in the matrix */ + int_t nsuper; /* number of supernodes, minus 1 */ + void *nzval; /* pointer to array of nonzero values, packed by column */ + int_t *nzval_colptr;/* pointer to array of beginning of columns in nzval[] */ + int_t *rowind; /* pointer to array of compressed row indices of + rectangular supernodes */ + int_t *rowind_colptr;/* pointer to array of beginning of columns in rowind[] */ + int_t *col_to_sup; /* col_to_sup[j] is the supernode number to which column + j belongs; mapping from column to supernode number. */ + int_t *sup_to_col; /* sup_to_col[s] points to the start of the s-th + supernode; mapping from supernode number to column. + e.g.: col_to_sup: 0 1 2 2 3 3 3 4 4 4 4 4 4 (ncol=12) + sup_to_col: 0 1 2 4 7 12 (nsuper=4) */ + /* Note: + Zero-based indexing is used; + nzval_colptr[], rowind_colptr[], col_to_sup and + sup_to_col[] have ncol+1 entries, the last one + pointing beyond the last column. + For col_to_sup[], only the first ncol entries are + defined. For sup_to_col[], only the first nsuper+2 + entries are defined. */ +} SCformat; + +/* Stype == NCP */ +typedef struct { + int_t nnz; /* number of nonzeros in the matrix */ + void *nzval; /* pointer to array of nonzero values, packed by column */ + int_t *rowind;/* pointer to array of row indices of the nonzeros */ + /* Note: nzval[]/rowind[] always have the same length */ + int_t *colbeg;/* colbeg[j] points to the beginning of column j in nzval[] + and rowind[] */ + int_t *colend;/* colend[j] points to one past the last element of column + j in nzval[] and rowind[] */ + /* Note: + Zero-based indexing is used; + The consecutive columns of the nonzeros may not be + contiguous in storage, because the matrix has been + postmultiplied by a column permutation matrix. */ +} NCPformat; + +/* Stype == DN */ +typedef struct { + int_t lda; /* leading dimension */ + void *nzval; /* array of size lda*ncol to represent a dense matrix */ +} DNformat; + + + +/********************************************************* + * Macros used for easy access of sparse matrix entries. * + *********************************************************/ +#define L_SUB_START(col) ( Lstore->rowind_colptr[col] ) +#define L_SUB(ptr) ( Lstore->rowind[ptr] ) +#define L_NZ_START(col) ( Lstore->nzval_colptr[col] ) +#define L_FST_SUPC(superno) ( Lstore->sup_to_col[superno] ) +#define U_NZ_START(col) ( Ustore->colptr[col] ) +#define U_SUB(ptr) ( Ustore->rowind[ptr] ) + + +#endif /* __SUPERLU_SUPERMATRIX */ diff --git a/intern/opennl/superlu/sutil.c b/intern/opennl/superlu/sutil.c new file mode 100644 index 00000000000..4689f34968a --- /dev/null +++ b/intern/opennl/superlu/sutil.c @@ -0,0 +1,478 @@ + +/* + * -- SuperLU routine (version 3.0) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * October 15, 2003 + * + */ +/* + Copyright (c) 1994 by Xerox Corporation. All rights reserved. + + THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY + EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. + + Permission is hereby granted to use or copy this program for any + purpose, provided the above notices are retained on all copies. + Permission to modify the code and to distribute modified code is + granted, provided the above notices are retained, and a notice that + the code was modified is included with the above copyright notice. +*/ + +#include +#include "ssp_defs.h" + +void +sCreate_CompCol_Matrix(SuperMatrix *A, int m, int n, int nnz, + float *nzval, int *rowind, int *colptr, + Stype_t stype, Dtype_t dtype, Mtype_t mtype) +{ + NCformat *Astore; + + A->Stype = stype; + A->Dtype = dtype; + A->Mtype = mtype; + A->nrow = m; + A->ncol = n; + A->Store = (void *) SUPERLU_MALLOC( sizeof(NCformat) ); + if ( !(A->Store) ) ABORT("SUPERLU_MALLOC fails for A->Store"); + Astore = A->Store; + Astore->nnz = nnz; + Astore->nzval = nzval; + Astore->rowind = rowind; + Astore->colptr = colptr; +} + +void +sCreate_CompRow_Matrix(SuperMatrix *A, int m, int n, int nnz, + float *nzval, int *colind, int *rowptr, + Stype_t stype, Dtype_t dtype, Mtype_t mtype) +{ + NRformat *Astore; + + A->Stype = stype; + A->Dtype = dtype; + A->Mtype = mtype; + A->nrow = m; + A->ncol = n; + A->Store = (void *) SUPERLU_MALLOC( sizeof(NRformat) ); + if ( !(A->Store) ) ABORT("SUPERLU_MALLOC fails for A->Store"); + Astore = A->Store; + Astore->nnz = nnz; + Astore->nzval = nzval; + Astore->colind = colind; + Astore->rowptr = rowptr; +} + +/* Copy matrix A into matrix B. */ +void +sCopy_CompCol_Matrix(SuperMatrix *A, SuperMatrix *B) +{ + NCformat *Astore, *Bstore; + int ncol, nnz, i; + + B->Stype = A->Stype; + B->Dtype = A->Dtype; + B->Mtype = A->Mtype; + B->nrow = A->nrow;; + B->ncol = ncol = A->ncol; + Astore = (NCformat *) A->Store; + Bstore = (NCformat *) B->Store; + Bstore->nnz = nnz = Astore->nnz; + for (i = 0; i < nnz; ++i) + ((float *)Bstore->nzval)[i] = ((float *)Astore->nzval)[i]; + for (i = 0; i < nnz; ++i) Bstore->rowind[i] = Astore->rowind[i]; + for (i = 0; i <= ncol; ++i) Bstore->colptr[i] = Astore->colptr[i]; +} + + +void +sCreate_Dense_Matrix(SuperMatrix *X, int m, int n, float *x, int ldx, + Stype_t stype, Dtype_t dtype, Mtype_t mtype) +{ + DNformat *Xstore; + + X->Stype = stype; + X->Dtype = dtype; + X->Mtype = mtype; + X->nrow = m; + X->ncol = n; + X->Store = (void *) SUPERLU_MALLOC( sizeof(DNformat) ); + if ( !(X->Store) ) ABORT("SUPERLU_MALLOC fails for X->Store"); + Xstore = (DNformat *) X->Store; + Xstore->lda = ldx; + Xstore->nzval = (float *) x; +} + +void +sCopy_Dense_Matrix(int M, int N, float *X, int ldx, + float *Y, int ldy) +{ +/* + * + * Purpose + * ======= + * + * Copies a two-dimensional matrix X to another matrix Y. + */ + int i, j; + + for (j = 0; j < N; ++j) + for (i = 0; i < M; ++i) + Y[i + j*ldy] = X[i + j*ldx]; +} + +void +sCreate_SuperNode_Matrix(SuperMatrix *L, int m, int n, int nnz, + float *nzval, int *nzval_colptr, int *rowind, + int *rowind_colptr, int *col_to_sup, int *sup_to_col, + Stype_t stype, Dtype_t dtype, Mtype_t mtype) +{ + SCformat *Lstore; + + L->Stype = stype; + L->Dtype = dtype; + L->Mtype = mtype; + L->nrow = m; + L->ncol = n; + L->Store = (void *) SUPERLU_MALLOC( sizeof(SCformat) ); + if ( !(L->Store) ) ABORT("SUPERLU_MALLOC fails for L->Store"); + Lstore = L->Store; + Lstore->nnz = nnz; + Lstore->nsuper = col_to_sup[n]; + Lstore->nzval = nzval; + Lstore->nzval_colptr = nzval_colptr; + Lstore->rowind = rowind; + Lstore->rowind_colptr = rowind_colptr; + Lstore->col_to_sup = col_to_sup; + Lstore->sup_to_col = sup_to_col; + +} + + +/* + * Convert a row compressed storage into a column compressed storage. + */ +void +sCompRow_to_CompCol(int m, int n, int nnz, + float *a, int *colind, int *rowptr, + float **at, int **rowind, int **colptr) +{ + register int i, j, col, relpos; + int *marker; + + /* Allocate storage for another copy of the matrix. */ + *at = (float *) floatMalloc(nnz); + *rowind = (int *) intMalloc(nnz); + *colptr = (int *) intMalloc(n+1); + marker = (int *) intCalloc(n); + + /* Get counts of each column of A, and set up column pointers */ + for (i = 0; i < m; ++i) + for (j = rowptr[i]; j < rowptr[i+1]; ++j) ++marker[colind[j]]; + (*colptr)[0] = 0; + for (j = 0; j < n; ++j) { + (*colptr)[j+1] = (*colptr)[j] + marker[j]; + marker[j] = (*colptr)[j]; + } + + /* Transfer the matrix into the compressed column storage. */ + for (i = 0; i < m; ++i) { + for (j = rowptr[i]; j < rowptr[i+1]; ++j) { + col = colind[j]; + relpos = marker[col]; + (*rowind)[relpos] = i; + (*at)[relpos] = a[j]; + ++marker[col]; + } + } + + SUPERLU_FREE(marker); +} + + +void +sPrint_CompCol_Matrix(char *what, SuperMatrix *A) +{ + NCformat *Astore; + register int i,n; + float *dp; + + printf("\nCompCol matrix %s:\n", what); + printf("Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype); + n = A->ncol; + Astore = (NCformat *) A->Store; + dp = (float *) Astore->nzval; + printf("nrow %d, ncol %d, nnz %d\n", A->nrow,A->ncol,Astore->nnz); + printf("nzval: "); + for (i = 0; i < Astore->colptr[n]; ++i) printf("%f ", dp[i]); + printf("\nrowind: "); + for (i = 0; i < Astore->colptr[n]; ++i) printf("%d ", Astore->rowind[i]); + printf("\ncolptr: "); + for (i = 0; i <= n; ++i) printf("%d ", Astore->colptr[i]); + printf("\n"); + fflush(stdout); +} + +void +sPrint_SuperNode_Matrix(char *what, SuperMatrix *A) +{ + SCformat *Astore; + register int i, j, k, c, d, n, nsup; + float *dp; + int *col_to_sup, *sup_to_col, *rowind, *rowind_colptr; + + printf("\nSuperNode matrix %s:\n", what); + printf("Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype); + n = A->ncol; + Astore = (SCformat *) A->Store; + dp = (float *) Astore->nzval; + col_to_sup = Astore->col_to_sup; + sup_to_col = Astore->sup_to_col; + rowind_colptr = Astore->rowind_colptr; + rowind = Astore->rowind; + printf("nrow %d, ncol %d, nnz %d, nsuper %d\n", + A->nrow,A->ncol,Astore->nnz,Astore->nsuper); + printf("nzval:\n"); + for (k = 0; k <= Astore->nsuper; ++k) { + c = sup_to_col[k]; + nsup = sup_to_col[k+1] - c; + for (j = c; j < c + nsup; ++j) { + d = Astore->nzval_colptr[j]; + for (i = rowind_colptr[c]; i < rowind_colptr[c+1]; ++i) { + printf("%d\t%d\t%e\n", rowind[i], j, dp[d++]); + } + } + } +#if 0 + for (i = 0; i < Astore->nzval_colptr[n]; ++i) printf("%f ", dp[i]); +#endif + printf("\nnzval_colptr: "); + for (i = 0; i <= n; ++i) printf("%d ", Astore->nzval_colptr[i]); + printf("\nrowind: "); + for (i = 0; i < Astore->rowind_colptr[n]; ++i) + printf("%d ", Astore->rowind[i]); + printf("\nrowind_colptr: "); + for (i = 0; i <= n; ++i) printf("%d ", Astore->rowind_colptr[i]); + printf("\ncol_to_sup: "); + for (i = 0; i < n; ++i) printf("%d ", col_to_sup[i]); + printf("\nsup_to_col: "); + for (i = 0; i <= Astore->nsuper+1; ++i) + printf("%d ", sup_to_col[i]); + printf("\n"); + fflush(stdout); +} + +void +sPrint_Dense_Matrix(char *what, SuperMatrix *A) +{ + DNformat *Astore; + register int i; + float *dp; + + printf("\nDense matrix %s:\n", what); + printf("Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype); + Astore = (DNformat *) A->Store; + dp = (float *) Astore->nzval; + printf("nrow %d, ncol %d, lda %d\n", A->nrow,A->ncol,Astore->lda); + printf("\nnzval: "); + for (i = 0; i < A->nrow; ++i) printf("%f ", dp[i]); + printf("\n"); + fflush(stdout); +} + +/* + * Diagnostic print of column "jcol" in the U/L factor. + */ +void +sprint_lu_col(char *msg, int jcol, int pivrow, int *xprune, GlobalLU_t *Glu) +{ + int i, k, fsupc; + int *xsup, *supno; + int *xlsub, *lsub; + float *lusup; + int *xlusup; + float *ucol; + int *usub, *xusub; + + xsup = Glu->xsup; + supno = Glu->supno; + lsub = Glu->lsub; + xlsub = Glu->xlsub; + lusup = Glu->lusup; + xlusup = Glu->xlusup; + ucol = Glu->ucol; + usub = Glu->usub; + xusub = Glu->xusub; + + printf("%s", msg); + printf("col %d: pivrow %d, supno %d, xprune %d\n", + jcol, pivrow, supno[jcol], xprune[jcol]); + + printf("\tU-col:\n"); + for (i = xusub[jcol]; i < xusub[jcol+1]; i++) + printf("\t%d%10.4f\n", usub[i], ucol[i]); + printf("\tL-col in rectangular snode:\n"); + fsupc = xsup[supno[jcol]]; /* first col of the snode */ + i = xlsub[fsupc]; + k = xlusup[jcol]; + while ( i < xlsub[fsupc+1] && k < xlusup[jcol+1] ) { + printf("\t%d\t%10.4f\n", lsub[i], lusup[k]); + i++; k++; + } + fflush(stdout); +} + + +/* + * Check whether tempv[] == 0. This should be true before and after + * calling any numeric routines, i.e., "panel_bmod" and "column_bmod". + */ +void scheck_tempv(int n, float *tempv) +{ + int i; + + for (i = 0; i < n; i++) { + if (tempv[i] != 0.0) + { + fprintf(stderr,"tempv[%d] = %f\n", i,tempv[i]); + ABORT("scheck_tempv"); + } + } +} + + +void +sGenXtrue(int n, int nrhs, float *x, int ldx) +{ + int i, j; + for (j = 0; j < nrhs; ++j) + for (i = 0; i < n; ++i) { + x[i + j*ldx] = 1.0;/* + (float)(i+1.)/n;*/ + } +} + +/* + * Let rhs[i] = sum of i-th row of A, so the solution vector is all 1's + */ +void +sFillRHS(trans_t trans, int nrhs, float *x, int ldx, + SuperMatrix *A, SuperMatrix *B) +{ + NCformat *Astore; + float *Aval; + DNformat *Bstore; + float *rhs; + float one = 1.0; + float zero = 0.0; + int ldc; + char transc[1]; + + Astore = A->Store; + Aval = (float *) Astore->nzval; + Bstore = B->Store; + rhs = Bstore->nzval; + ldc = Bstore->lda; + + if ( trans == NOTRANS ) *(unsigned char *)transc = 'N'; + else *(unsigned char *)transc = 'T'; + + sp_sgemm(transc, nrhs, one, A, + x, ldx, zero, rhs, ldc); + +} + +/* + * Fills a float precision array with a given value. + */ +void +sfill(float *a, int alen, float dval) +{ + register int i; + for (i = 0; i < alen; i++) a[i] = dval; +} + + + +/* + * Check the inf-norm of the error vector + */ +void sinf_norm_error(int nrhs, SuperMatrix *X, float *xtrue) +{ + DNformat *Xstore; + float err, xnorm; + float *Xmat, *soln_work; + int i, j; + + Xstore = X->Store; + Xmat = Xstore->nzval; + + for (j = 0; j < nrhs; j++) { + soln_work = &Xmat[j*Xstore->lda]; + err = xnorm = 0.0; + for (i = 0; i < X->nrow; i++) { + err = SUPERLU_MAX(err, fabs(soln_work[i] - xtrue[i])); + xnorm = SUPERLU_MAX(xnorm, fabs(soln_work[i])); + } + err = err / xnorm; + printf("||X - Xtrue||/||X|| = %e\n", err); + } +} + + + +/* Print performance of the code. */ +void +sPrintPerf(SuperMatrix *L, SuperMatrix *U, mem_usage_t *mem_usage, + float rpg, float rcond, float *ferr, + float *berr, char *equed, SuperLUStat_t *stat) +{ + SCformat *Lstore; + NCformat *Ustore; + double *utime; + flops_t *ops; + + utime = stat->utime; + ops = stat->ops; + + if ( utime[FACT] != 0. ) + printf("Factor flops = %e\tMflops = %8.2f\n", ops[FACT], + ops[FACT]*1e-6/utime[FACT]); + printf("Identify relaxed snodes = %8.2f\n", utime[RELAX]); + if ( utime[SOLVE] != 0. ) + printf("Solve flops = %.0f, Mflops = %8.2f\n", ops[SOLVE], + ops[SOLVE]*1e-6/utime[SOLVE]); + + Lstore = (SCformat *) L->Store; + Ustore = (NCformat *) U->Store; + printf("\tNo of nonzeros in factor L = %d\n", Lstore->nnz); + printf("\tNo of nonzeros in factor U = %d\n", Ustore->nnz); + printf("\tNo of nonzeros in L+U = %d\n", Lstore->nnz + Ustore->nnz); + + printf("L\\U MB %.3f\ttotal MB needed %.3f\texpansions %d\n", + mem_usage->for_lu/1e6, mem_usage->total_needed/1e6, + mem_usage->expansions); + + printf("\tFactor\tMflops\tSolve\tMflops\tEtree\tEquil\tRcond\tRefine\n"); + printf("PERF:%8.2f%8.2f%8.2f%8.2f%8.2f%8.2f%8.2f%8.2f\n", + utime[FACT], ops[FACT]*1e-6/utime[FACT], + utime[SOLVE], ops[SOLVE]*1e-6/utime[SOLVE], + utime[ETREE], utime[EQUIL], utime[RCOND], utime[REFINE]); + + printf("\tRpg\t\tRcond\t\tFerr\t\tBerr\t\tEquil?\n"); + printf("NUM:\t%e\t%e\t%e\t%e\t%s\n", + rpg, rcond, ferr[0], berr[0], equed); + +} + + + + +int print_float_vec(char *what, int n, float *vec) +{ + int i; + printf("%s: n %d\n", what, n); + for (i = 0; i < n; ++i) printf("%d\t%f\n", i, vec[i]); + return 0; +} + diff --git a/intern/opennl/superlu/util.c b/intern/opennl/superlu/util.c new file mode 100644 index 00000000000..3c49d714d1c --- /dev/null +++ b/intern/opennl/superlu/util.c @@ -0,0 +1,391 @@ +/* + * -- SuperLU routine (version 3.0) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * October 15, 2003 + * + */ +/* + Copyright (c) 1994 by Xerox Corporation. All rights reserved. + + THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY + EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. + + Permission is hereby granted to use or copy this program for any + purpose, provided the above notices are retained on all copies. + Permission to modify the code and to distribute modified code is + granted, provided the above notices are retained, and a notice that + the code was modified is included with the above copyright notice. +*/ + +#include +#include "ssp_defs.h" +#include "util.h" + +/* + * Global statistics variale + */ + +void superlu_abort_and_exit(char* msg) +{ + fprintf(stderr, msg); + exit (-1); +} + +/* + * Set the default values for the options argument. + */ +void set_default_options(superlu_options_t *options) +{ + options->Fact = DOFACT; + options->Equil = YES; + options->ColPerm = COLAMD; + options->DiagPivotThresh = 1.0; + options->Trans = NOTRANS; + options->IterRefine = NOREFINE; + options->SymmetricMode = NO; + options->PivotGrowth = NO; + options->ConditionNumber = NO; + options->PrintStat = YES; +} + +/* Deallocate the structure pointing to the actual storage of the matrix. */ +void +Destroy_SuperMatrix_Store(SuperMatrix *A) +{ + SUPERLU_FREE ( A->Store ); +} + +void +Destroy_CompCol_Matrix(SuperMatrix *A) +{ + SUPERLU_FREE( ((NCformat *)A->Store)->rowind ); + SUPERLU_FREE( ((NCformat *)A->Store)->colptr ); + SUPERLU_FREE( ((NCformat *)A->Store)->nzval ); + SUPERLU_FREE( A->Store ); +} + +void +Destroy_CompRow_Matrix(SuperMatrix *A) +{ + SUPERLU_FREE( ((NRformat *)A->Store)->colind ); + SUPERLU_FREE( ((NRformat *)A->Store)->rowptr ); + SUPERLU_FREE( ((NRformat *)A->Store)->nzval ); + SUPERLU_FREE( A->Store ); +} + +void +Destroy_SuperNode_Matrix(SuperMatrix *A) +{ + SUPERLU_FREE ( ((SCformat *)A->Store)->rowind ); + SUPERLU_FREE ( ((SCformat *)A->Store)->rowind_colptr ); + SUPERLU_FREE ( ((SCformat *)A->Store)->nzval ); + SUPERLU_FREE ( ((SCformat *)A->Store)->nzval_colptr ); + SUPERLU_FREE ( ((SCformat *)A->Store)->col_to_sup ); + SUPERLU_FREE ( ((SCformat *)A->Store)->sup_to_col ); + SUPERLU_FREE ( A->Store ); +} + +/* A is of type Stype==NCP */ +void +Destroy_CompCol_Permuted(SuperMatrix *A) +{ + SUPERLU_FREE ( ((NCPformat *)A->Store)->colbeg ); + SUPERLU_FREE ( ((NCPformat *)A->Store)->colend ); + SUPERLU_FREE ( A->Store ); +} + +/* A is of type Stype==DN */ +void +Destroy_Dense_Matrix(SuperMatrix *A) +{ + DNformat* Astore = A->Store; + SUPERLU_FREE (Astore->nzval); + SUPERLU_FREE ( A->Store ); +} + +/* + * Reset repfnz[] for the current column + */ +void +resetrep_col (const int nseg, const int *segrep, int *repfnz) +{ + int i, irep; + + for (i = 0; i < nseg; i++) { + irep = segrep[i]; + repfnz[irep] = EMPTY; + } +} + + +/* + * Count the total number of nonzeros in factors L and U, and in the + * symmetrically reduced L. + */ +void +countnz(const int n, int *xprune, int *nnzL, int *nnzU, GlobalLU_t *Glu) +{ + int nsuper, fsupc, i, j; + int nnzL0, jlen, irep; + int *xsup, *xlsub; + + xsup = Glu->xsup; + xlsub = Glu->xlsub; + *nnzL = 0; + *nnzU = (Glu->xusub)[n]; + nnzL0 = 0; + nsuper = (Glu->supno)[n]; + + if ( n <= 0 ) return; + + /* + * For each supernode + */ + for (i = 0; i <= nsuper; i++) { + fsupc = xsup[i]; + jlen = xlsub[fsupc+1] - xlsub[fsupc]; + + for (j = fsupc; j < xsup[i+1]; j++) { + *nnzL += jlen; + *nnzU += j - fsupc + 1; + jlen--; + } + irep = xsup[i+1] - 1; + nnzL0 += xprune[irep] - xlsub[irep]; + } + + /* printf("\tNo of nonzeros in symm-reduced L = %d\n", nnzL0);*/ +} + + + +/* + * Fix up the data storage lsub for L-subscripts. It removes the subscript + * sets for structural pruning, and applies permuation to the remaining + * subscripts. + */ +void +fixupL(const int n, const int *perm_r, GlobalLU_t *Glu) +{ + register int nsuper, fsupc, nextl, i, j, k, jstrt; + int *xsup, *lsub, *xlsub; + + if ( n <= 1 ) return; + + xsup = Glu->xsup; + lsub = Glu->lsub; + xlsub = Glu->xlsub; + nextl = 0; + nsuper = (Glu->supno)[n]; + + /* + * For each supernode ... + */ + for (i = 0; i <= nsuper; i++) { + fsupc = xsup[i]; + jstrt = xlsub[fsupc]; + xlsub[fsupc] = nextl; + for (j = jstrt; j < xlsub[fsupc+1]; j++) { + lsub[nextl] = perm_r[lsub[j]]; /* Now indexed into P*A */ + nextl++; + } + for (k = fsupc+1; k < xsup[i+1]; k++) + xlsub[k] = nextl; /* Other columns in supernode i */ + + } + + xlsub[n] = nextl; +} + + +/* + * Diagnostic print of segment info after panel_dfs(). + */ +void print_panel_seg(int n, int w, int jcol, int nseg, + int *segrep, int *repfnz) +{ + int j, k; + + for (j = jcol; j < jcol+w; j++) { + printf("\tcol %d:\n", j); + for (k = 0; k < nseg; k++) + printf("\t\tseg %d, segrep %d, repfnz %d\n", k, + segrep[k], repfnz[(j-jcol)*n + segrep[k]]); + } + +} + + +void +StatInit(SuperLUStat_t *stat) +{ + register int i, w, panel_size, relax; + + panel_size = sp_ienv(1); + relax = sp_ienv(2); + w = SUPERLU_MAX(panel_size, relax); + stat->panel_histo = intCalloc(w+1); + stat->utime = (double *) SUPERLU_MALLOC(NPHASES * sizeof(double)); + if (!stat->utime) ABORT("SUPERLU_MALLOC fails for stat->utime"); + stat->ops = (flops_t *) SUPERLU_MALLOC(NPHASES * sizeof(flops_t)); + if (!stat->ops) ABORT("SUPERLU_MALLOC fails for stat->ops"); + for (i = 0; i < NPHASES; ++i) { + stat->utime[i] = 0.; + stat->ops[i] = 0.; + } +} + + +void +StatPrint(SuperLUStat_t *stat) +{ + double *utime; + flops_t *ops; + + utime = stat->utime; + ops = stat->ops; + printf("Factor time = %8.2f\n", utime[FACT]); + if ( utime[FACT] != 0.0 ) + printf("Factor flops = %e\tMflops = %8.2f\n", ops[FACT], + ops[FACT]*1e-6/utime[FACT]); + + printf("Solve time = %8.2f\n", utime[SOLVE]); + if ( utime[SOLVE] != 0.0 ) + printf("Solve flops = %e\tMflops = %8.2f\n", ops[SOLVE], + ops[SOLVE]*1e-6/utime[SOLVE]); + +} + + +void +StatFree(SuperLUStat_t *stat) +{ + SUPERLU_FREE(stat->panel_histo); + SUPERLU_FREE(stat->utime); + SUPERLU_FREE(stat->ops); +} + + +flops_t +LUFactFlops(SuperLUStat_t *stat) +{ + return (stat->ops[FACT]); +} + +flops_t +LUSolveFlops(SuperLUStat_t *stat) +{ + return (stat->ops[SOLVE]); +} + + + + + +/* + * Fills an integer array with a given value. + */ +void ifill(int *a, int alen, int ival) +{ + register int i; + for (i = 0; i < alen; i++) a[i] = ival; +} + + + +/* + * Get the statistics of the supernodes + */ +#define NBUCKS 10 +static int max_sup_size; + +void super_stats(int nsuper, int *xsup) +{ + register int nsup1 = 0; + int i, isize, whichb, bl, bh; + int bucket[NBUCKS]; + + max_sup_size = 0; + + for (i = 0; i <= nsuper; i++) { + isize = xsup[i+1] - xsup[i]; + if ( isize == 1 ) nsup1++; + if ( max_sup_size < isize ) max_sup_size = isize; + } + + printf(" Supernode statistics:\n\tno of super = %d\n", nsuper+1); + printf("\tmax supernode size = %d\n", max_sup_size); + printf("\tno of size 1 supernodes = %d\n", nsup1); + + /* Histogram of the supernode sizes */ + ifill (bucket, NBUCKS, 0); + + for (i = 0; i <= nsuper; i++) { + isize = xsup[i+1] - xsup[i]; + whichb = (float) isize / max_sup_size * NBUCKS; + if (whichb >= NBUCKS) whichb = NBUCKS - 1; + bucket[whichb]++; + } + + printf("\tHistogram of supernode sizes:\n"); + for (i = 0; i < NBUCKS; i++) { + bl = (float) i * max_sup_size / NBUCKS; + bh = (float) (i+1) * max_sup_size / NBUCKS; + printf("\tsnode: %d-%d\t\t%d\n", bl+1, bh, bucket[i]); + } + +} + + +float SpaSize(int n, int np, float sum_npw) +{ + return (sum_npw*8 + np*8 + n*4)/1024.; +} + +float DenseSize(int n, float sum_nw) +{ + return (sum_nw*8 + n*8)/1024.;; +} + + + +/* + * Check whether repfnz[] == EMPTY after reset. + */ +void check_repfnz(int n, int w, int jcol, int *repfnz) +{ + int jj, k; + + for (jj = jcol; jj < jcol+w; jj++) + for (k = 0; k < n; k++) + if ( repfnz[(jj-jcol)*n + k] != EMPTY ) { + fprintf(stderr, "col %d, repfnz_col[%d] = %d\n", jj, + k, repfnz[(jj-jcol)*n + k]); + ABORT("check_repfnz"); + } +} + + +/* Print a summary of the testing results. */ +void +PrintSumm(char *type, int nfail, int nrun, int nerrs) +{ + if ( nfail > 0 ) + printf("%3s driver: %d out of %d tests failed to pass the threshold\n", + type, nfail, nrun); + else + printf("All tests for %3s driver passed the threshold (%6d tests run)\n", type, nrun); + + if ( nerrs > 0 ) + printf("%6d error messages recorded\n", nerrs); +} + + +int print_int_vec(char *what, int n, int *vec) +{ + int i; + printf("%s\n", what); + for (i = 0; i < n; ++i) printf("%d\t%d\n", i, vec[i]); + return 0; +} diff --git a/intern/opennl/superlu/util.h b/intern/opennl/superlu/util.h new file mode 100644 index 00000000000..1a3526d4e7e --- /dev/null +++ b/intern/opennl/superlu/util.h @@ -0,0 +1,267 @@ +#ifndef __SUPERLU_UTIL /* allow multiple inclusions */ +#define __SUPERLU_UTIL + +#include +#include +#include +/* +#ifndef __STDC__ +#include +#endif +*/ +#include + +/*********************************************************************** + * Macros + ***********************************************************************/ +#define FIRSTCOL_OF_SNODE(i) (xsup[i]) +/* No of marker arrays used in the symbolic factorization, + each of size n */ +#define NO_MARKER 3 +#define NUM_TEMPV(m,w,t,b) ( SUPERLU_MAX(m, (t + b)*w) ) + +#ifndef USER_ABORT +#define USER_ABORT(msg) superlu_abort_and_exit(msg) +#endif + +#define ABORT(err_msg) \ + { char msg[256];\ + sprintf(msg,"%s at line %d in file %s\n",err_msg,__LINE__, __FILE__);\ + USER_ABORT(msg); } + + +#ifndef USER_MALLOC +#if 1 +#define USER_MALLOC(size) superlu_malloc(size) +#else +/* The following may check out some uninitialized data */ +#define USER_MALLOC(size) memset (superlu_malloc(size), '\x0F', size) +#endif +#endif + +#define SUPERLU_MALLOC(size) USER_MALLOC(size) + +#ifndef USER_FREE +#define USER_FREE(addr) superlu_free(addr) +#endif + +#define SUPERLU_FREE(addr) USER_FREE(addr) + +#define CHECK_MALLOC(where) { \ + extern int superlu_malloc_total; \ + printf("%s: malloc_total %d Bytes\n", \ + where, superlu_malloc_total); \ +} + +#define SUPERLU_MAX(x, y) ( (x) > (y) ? (x) : (y) ) +#define SUPERLU_MIN(x, y) ( (x) < (y) ? (x) : (y) ) + +/*********************************************************************** + * Constants + ***********************************************************************/ +#define EMPTY (-1) +/*#define NO (-1)*/ +#define FALSE 0 +#define TRUE 1 + +/*********************************************************************** + * Enumerate types + ***********************************************************************/ +typedef enum {NO, YES} yes_no_t; +typedef enum {DOFACT, SamePattern, SamePattern_SameRowPerm, FACTORED} fact_t; +typedef enum {NOROWPERM, LargeDiag, MY_PERMR} rowperm_t; +typedef enum {NATURAL, MMD_ATA, MMD_AT_PLUS_A, COLAMD, MY_PERMC}colperm_t; +typedef enum {NOTRANS, TRANS, CONJ} trans_t; +typedef enum {NOEQUIL, ROW, COL, BOTH} DiagScale_t; +typedef enum {NOREFINE, SINGLE=1, SLU_DOUBLE, EXTRA} IterRefine_t; +typedef enum {LUSUP, UCOL, LSUB, USUB} MemType; +typedef enum {HEAD, TAIL} stack_end_t; +typedef enum {SYSTEM, USER} LU_space_t; + +/* + * The following enumerate type is used by the statistics variable + * to keep track of flop count and time spent at various stages. + * + * Note that not all of the fields are disjoint. + */ +typedef enum { + COLPERM, /* find a column ordering that minimizes fills */ + RELAX, /* find artificial supernodes */ + ETREE, /* compute column etree */ + EQUIL, /* equilibrate the original matrix */ + FACT, /* perform LU factorization */ + RCOND, /* estimate reciprocal condition number */ + SOLVE, /* forward and back solves */ + REFINE, /* perform iterative refinement */ + SLU_FLOAT, /* time spent in floating-point operations */ + TRSV, /* fraction of FACT spent in xTRSV */ + GEMV, /* fraction of FACT spent in xGEMV */ + FERR, /* estimate error bounds after iterative refinement */ + NPHASES /* total number of phases */ +} PhaseType; + + +/*********************************************************************** + * Type definitions + ***********************************************************************/ +typedef float flops_t; +typedef unsigned char Logical; + +/* + *-- This contains the options used to control the solve process. + * + * Fact (fact_t) + * Specifies whether or not the factored form of the matrix + * A is supplied on entry, and if not, how the matrix A should + * be factorizaed. + * = DOFACT: The matrix A will be factorized from scratch, and the + * factors will be stored in L and U. + * = SamePattern: The matrix A will be factorized assuming + * that a factorization of a matrix with the same sparsity + * pattern was performed prior to this one. Therefore, this + * factorization will reuse column permutation vector + * ScalePermstruct->perm_c and the column elimination tree + * LUstruct->etree. + * = SamePattern_SameRowPerm: The matrix A will be factorized + * assuming that a factorization of a matrix with the same + * sparsity pattern and similar numerical values was performed + * prior to this one. Therefore, this factorization will reuse + * both row and column scaling factors R and C, and the + * both row and column permutation vectors perm_r and perm_c, + * distributed data structure set up from the previous symbolic + * factorization. + * = FACTORED: On entry, L, U, perm_r and perm_c contain the + * factored form of A. If DiagScale is not NOEQUIL, the matrix + * A has been equilibrated with scaling factors R and C. + * + * Equil (yes_no_t) + * Specifies whether to equilibrate the system (scale A's row and + * columns to have unit norm). + * + * ColPerm (colperm_t) + * Specifies what type of column permutation to use to reduce fill. + * = NATURAL: use the natural ordering + * = MMD_ATA: use minimum degree ordering on structure of A'*A + * = MMD_AT_PLUS_A: use minimum degree ordering on structure of A'+A + * = COLAMD: use approximate minimum degree column ordering + * = MY_PERMC: use the ordering specified in ScalePermstruct->perm_c[] + * + * Trans (trans_t) + * Specifies the form of the system of equations: + * = NOTRANS: A * X = B (No transpose) + * = TRANS: A**T * X = B (Transpose) + * = CONJ: A**H * X = B (Transpose) + * + * IterRefine (IterRefine_t) + * Specifies whether to perform iterative refinement. + * = NO: no iterative refinement + * = WorkingPrec: perform iterative refinement in working precision + * = ExtraPrec: perform iterative refinement in extra precision + * + * PrintStat (yes_no_t) + * Specifies whether to print the solver's statistics. + * + * DiagPivotThresh (double, in [0.0, 1.0]) (only for sequential SuperLU) + * Specifies the threshold used for a diagonal entry to be an + * acceptable pivot. + * + * PivotGrowth (yes_no_t) + * Specifies whether to compute the reciprocal pivot growth. + * + * ConditionNumber (ues_no_t) + * Specifies whether to compute the reciprocal condition number. + * + * RowPerm (rowperm_t) (only for SuperLU_DIST) + * Specifies whether to permute rows of the original matrix. + * = NO: not to permute the rows + * = LargeDiag: make the diagonal large relative to the off-diagonal + * = MY_PERMR: use the permutation given in ScalePermstruct->perm_r[] + * + * ReplaceTinyPivot (yes_no_t) (only for SuperLU_DIST) + * Specifies whether to replace the tiny diagonals by + * sqrt(epsilon)*||A|| during LU factorization. + * + * SolveInitialized (yes_no_t) (only for SuperLU_DIST) + * Specifies whether the initialization has been performed to the + * triangular solve. + * + * RefineInitialized (yes_no_t) (only for SuperLU_DIST) + * Specifies whether the initialization has been performed to the + * sparse matrix-vector multiplication routine needed in iterative + * refinement. + */ +typedef struct { + fact_t Fact; + yes_no_t Equil; + colperm_t ColPerm; + trans_t Trans; + IterRefine_t IterRefine; + yes_no_t PrintStat; + yes_no_t SymmetricMode; + double DiagPivotThresh; + yes_no_t PivotGrowth; + yes_no_t ConditionNumber; + rowperm_t RowPerm; + yes_no_t ReplaceTinyPivot; + yes_no_t SolveInitialized; + yes_no_t RefineInitialized; +} superlu_options_t; + +typedef struct { + int *panel_histo; /* histogram of panel size distribution */ + double *utime; /* running time at various phases */ + flops_t *ops; /* operation count at various phases */ + int TinyPivots; /* number of tiny pivots */ + int RefineSteps; /* number of iterative refinement steps */ +} SuperLUStat_t; + + +/*********************************************************************** + * Prototypes + ***********************************************************************/ +#ifdef __cplusplus +extern "C" { +#endif + +extern void Destroy_SuperMatrix_Store(SuperMatrix *); +extern void Destroy_CompCol_Matrix(SuperMatrix *); +extern void Destroy_CompRow_Matrix(SuperMatrix *); +extern void Destroy_SuperNode_Matrix(SuperMatrix *); +extern void Destroy_CompCol_Permuted(SuperMatrix *); +extern void Destroy_Dense_Matrix(SuperMatrix *); +extern void get_perm_c(int, SuperMatrix *, int *); +extern void set_default_options(superlu_options_t *options); +extern void sp_preorder (superlu_options_t *, SuperMatrix*, int*, int*, + SuperMatrix*); +extern void superlu_abort_and_exit(char*); +extern void *superlu_malloc (size_t); +extern int *intMalloc (int); +extern int *intCalloc (int); +extern void superlu_free (void*); +extern void SetIWork (int, int, int, int *, int **, int **, int **, + int **, int **, int **, int **); +extern int sp_coletree (int *, int *, int *, int, int, int *); +extern void relax_snode (const int, int *, const int, int *, int *); +extern void heap_relax_snode (const int, int *, const int, int *, int *); +extern void resetrep_col (const int, const int *, int *); +extern int spcoletree (int *, int *, int *, int, int, int *); +extern int *TreePostorder (int, int *); +extern double SuperLU_timer_ (); +extern int sp_ienv (int); +extern int lsame_ (char *, char *); +extern int xerbla_ (char *, int *); +extern void ifill (int *, int, int); +extern void snode_profile (int, int *); +extern void super_stats (int, int *); +extern void PrintSumm (char *, int, int, int); +extern void StatInit(SuperLUStat_t *); +extern void StatPrint (SuperLUStat_t *); +extern void StatFree(SuperLUStat_t *); +extern void print_panel_seg(int, int, int, int, int *, int *); +extern void check_repfnz(int, int, int, int *); + +#ifdef __cplusplus + } +#endif + +#endif /* __SUPERLU_UTIL */ diff --git a/intern/opennl/superlu/xerbla.c b/intern/opennl/superlu/xerbla.c new file mode 100644 index 00000000000..cb94fa71d95 --- /dev/null +++ b/intern/opennl/superlu/xerbla.c @@ -0,0 +1,43 @@ + +#include + +/* Subroutine */ int xerbla_(char *srname, int *info) +{ +/* -- LAPACK auxiliary routine (version 2.0) -- + Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., + Courant Institute, Argonne National Lab, and Rice University + September 30, 1994 + + + Purpose + ======= + + XERBLA is an error handler for the LAPACK routines. + It is called by an LAPACK routine if an input parameter has an + invalid value. A message is printed and execution stops. + + Installers may consider modifying the STOP statement in order to + call system-specific exception-handling facilities. + + Arguments + ========= + + SRNAME (input) CHARACTER*6 + The name of the routine which called XERBLA. + + INFO (input) INT + The position of the invalid parameter in the parameter list + + of the calling routine. + + ===================================================================== +*/ + + printf("** On entry to %6s, parameter number %2d had an illegal value\n", + srname, *info); + +/* End of XERBLA */ + + return 0; +} /* xerbla_ */ + diff --git a/source/Makefile b/source/Makefile index a952fd35274..ccf4bb53043 100644 --- a/source/Makefile +++ b/source/Makefile @@ -87,6 +87,8 @@ PYPLAYERLIB ?= $(PYLIB) GRPLIB += $(OCGDIR)/blender/renderconverter/$(DEBUG_DIR)librenderconverter.a GRPLIB += $(OCGDIR)/blender/render/$(DEBUG_DIR)librender.a GRPLIB += $(OCGDIR)/blender/radiosity/$(DEBUG_DIR)libradiosity.a + GRPLIB += $(NAN_OPENNL)/lib/$(DEBUG_DIR)libopennl.a + GRPLIB += $(NAN_SUPERLU)/lib/$(DEBUG_DIR)libsuperlu.a GRPLIB += $(OCGDIR)/blender/python/$(DEBUG_DIR)libpython.a diff --git a/source/nan_definitions.mk b/source/nan_definitions.mk index 6156203d87b..697c9093d58 100644 --- a/source/nan_definitions.mk +++ b/source/nan_definitions.mk @@ -81,6 +81,8 @@ endif export NAN_GHOST ?= $(LCGDIR)/ghost export NAN_TEST_VERBOSITY ?= 1 export NAN_BMFONT ?= $(LCGDIR)/bmfont + export NAN_OPENNL ?= $(LCGDIR)/opennl + export NAN_SUPERLU ?= $(LCGDIR)/superlu ifeq ($(FREE_WINDOWS), true) export NAN_FTGL ?= $(LCGDIR)/gcc/ftgl else