Community
Zenoss Newsletter

Monitored by Zenoss
SourceForge.net Logo
Views

At least in version 2.1.1, zenwin, zenwinmodeler, and zeneventlog have 
(IMO) a critical defect: if a device has any /Status/WMI/Conn events that 
are still active (i.e. not yet in history), these daemons skip the device 
entirely. On our network, for some reason, we end up with a lot of these 
events ('timegenerated' errors, various intermittent failures to connect, 
etc.). As a result, monitoring of our Windows servers falls off 
dramatically the longer the system runs, and we miss critical issues. 

I changed the behavior of these three daemons to go ahead and attempt 
monitoring even if WMI issues have been recorded for the device. I learned 
that most of the time these WMI issues are spurious and successful 
monitoring CAN still be attempted. If you use this code, I recommend 
combining it with event commands that restart the Zenoss daemons whenever 
they are found dead. 

Also, in zenwin, I added/improved the exception handling: in the original 
code, a failure to create the watcher object occurs outside of any try 
block. Much of this code is an attempt to keep zenwin from crashing if it 
tries to monitor a Windows Server 2008 machine (Zenoss is not compatible 
with the WMI interface of WS 2008 or Vista, and zenwin cannot monitor 
services on these devices). I ended up adding a hardcoded exclusion list 
so that zenwin skips such a machine while the rest of Zenoss can still 
monitor it. For some reason, zeneventlog does not seem to crash, although 
it is not able to retrieve events from the WS 2008 machine either. 

I'd post the diff for the zeneventlog.py code as well, but it's more 
complicated and would require a bit more explanation. The diffs for 
zenwinmodeler.py and zenwin.py follow. 



$ZENHOME/Products/ZenWin/zenwinmodeler.py:
@@ -57,8 +57,11 @@
                 continue
             try:
                 if name in self.wmiprobs:
+                    """ 2007-12-14 - Attempt collection anyway as LOTS of devices have bad WMI states...
                     self.log.warn("skipping %s has bad wmi state", name)
                     continue
+                    """
+                    self.log.warn("%s has bad wmi state, processing anyway", name)
                 self.log.info("collecting from %s using user %s", name, user)
                 svcs = self.getServices(name, ip, user, passwd)
                 if not svcs:
@@ -74,7 +77,7 @@
             except pywintypes.com_error, e:
                 msg = self.printComErrorMessage(e)
                 if not msg:
-                    msg = "WMI connect error on %s: %s" % (name)
+                    msg = "WMI connect error on %s: " % (name)
                     code, txt, info, param = e
                     wmsg = "%s: %s" % (abs(code), txt)
                     if info:





$ZENHOME/Products/ZenWin/zenwin.py:
@@ -44,6 +44,7 @@

     def __init__(self):
         Base.__init__(self)
+        self.excludeDevices = []
         self.wmiprobs = []
         self.devices = []
         self.watchers = {}
@@ -95,30 +96,42 @@
             return None
         wql = "select Name from Win32_Service where State='Running'"
         w = self._wmi(srec)
-        w.connect()
-        svcs = [ svc.Name for svc in w.query(wql) ]
-        for name, (status, severity) in srec.services.items():
-            self.log.debug("service: %s status: %d", name, status)
-            if name not in svcs:
-                self.serviceStopped(srec, name)
-            elif status > 0:
-                self.serviceRunning(srec, name)
-        w.close()
+        try:
+            w.connect()
+            svcs = [ svc.Name for svc in w.query(wql) ]
+            for name, (status, severity) in srec.services.items():
+                self.log.debug("service: %s status: %d", name, status)
+                if name not in svcs:
+                    self.serviceStopped(srec, name)
+                elif status > 0:
+                    self.serviceRunning(srec, name)
+            w.close()
+        except Exception, ex:
+            self.log.warn("scanDevice(%s): %s", srec.name, str(ex))
+            self.closeWatcher(w)
+            raise

     def getWatcher(self, srec):
         wql = ("""SELECT * FROM __InstanceModificationEvent within 5 where """
                """TargetInstance ISA 'Win32_Service' """)
-        w = self._wmi(srec)
-        w.connect()
+        try:
+            w = self._wmi(srec)
+            w.connect()
+        except Exception, ex:
+            self.log.warn("getWatcher(%s): %s", srec.name, str(ex))
+            self.closeWatcher(w)
+            raise
         return w.watcher(wql)

     def processDevice(self, srec):
-        w = self.watchers.get(srec.name, None)
-        if not w:
-            self.scanDevice(srec)
-            self.deviceUp(srec)
-            self.watchers[srec.name] = w = self.getWatcher(srec)
+        self.log.info("Processing %s" % srec.name)
         try:
+            w = self.watchers.get(srec.name, None)
+            if not w:
+                self.log.info("Initializing %s, no existing watcher" % srec.name)
+                self.scanDevice(srec)
+                self.deviceUp(srec)
+                self.watchers[srec.name] = w = self.getWatcher(srec)
             self.log.debug("Querying %s", srec.name)
             s = w.nextEvent(100)
             self.deviceUp(srec)
@@ -137,12 +150,33 @@
                 self.log.debug("Codes: %r %r %r %r %r %r" % info)
                 scode = abs(scode)
             if scode != TIMEOUT_CODE:
+                self.log.warn("Codes: %r %r %r %r %r %r" % info)
                 self.deviceDown(srec, '%d: %s' % (code, txt))
+        except Exception, ex:
+            self.log.warn("processDevice(%s): %s", srec.name, str(ex))
+            self.deviceDown(srec, str(ex))
+
+    def closeWatcher(self, w, name=None):
+        try:
+            if w:
+                w.close()
+                w = None
+            if name and name in self.watchers:
+                del self.watchers[name]
+        except Exception, ex:
+            if not name: name = 'None'
+            self.log.warn("Exception closing watcher for %s: %s", name, str(ex))

     def processLoop(self):
         for device in self.devices:
             if device.name in self.wmiprobs:
-                self.log.debug("WMI problems on %s: skipping" % device.name)
+                #self.log.debug("WMI problems on %s: skipping" % device.name)
+                #continue
+               self.log.warn("WMI problems on %s: attempting processing anyway" % device.name)
+            if device.name in self.excludeDevices:
+                self.log.warn("Hardcoded exclusion of device %s", device.name)
                 continue
             try:
                 self.processDevice(device)
             except Exception, ex:
+                w = self.watchers.get(device.name, None)
+                self.closeWatcher(w, device.name)
                 self.deviceDown(device, str(ex))