Asnumpy() fails regularly when running MXNet on AWS Lambda


#1

I am running a pre-trained model on AWS Lambda. I have issues converting the MXNet NDArray into a numpy array. The error is reproducible with MXNet 0.10 and 0.12. It does work sometimes, but I am pretty sure from the debug statements that the error occurs when I call .asnumpy() on the NDArray.

I suspect the issue is that the model returns a huge ndarray, but see no reason why this basic call should fail.

https://s3.amazonaws.com/smallya-test/mxnet_lambda_code.zip


#2

It would help if you can post the error message. It would be ideal if you can also post a minimal code snippet that would reproduce the error.


#3

It's running in the Lambda environment, so I can’t capture the output or the core dump. The process exits with this message in the CloudWatch logs: core.python2.7.1 539111424. All print statements prior to .asnumpy() execute. This has been reproduced by multiple people in the community.

Here’s the code: also available in the zip file.

'''
Reference code to showcase MXNet model prediction on AWS Lambda
'''

import base64
import os
import boto3
import json
import tempfile
import urllib2
from urllib import urlretrieve

# Check if Lambda Function
# A missing LAMBDA_TASK_ROOT environment variable is treated as "not running
# inside a Lambda function", in which case the script stops before doing any work.
if os.environ.get('LAMBDA_TASK_ROOT') is None:
    print("just exit, we are not in a lambda function")
    import sys
    sys.exit(0)

# Module-level geocoder; lambda_handler calls geolocator.reverse() on the
# predicted "lat, lng" string.
from geopy.geocoders import Nominatim
geolocator = Nominatim()

# Print all files in the tmp directory (name and size), then delete each one.
import os
for f in os.listdir("/tmp"):
    f_path = os.path.join("/tmp", f)
    print("%s %s" % (f, os.stat(f_path).st_size))
    os.unlink(f_path)

import mxnet as mx
import numpy as np

from PIL import Image
from io import BytesIO
from collections import namedtuple
# Minimal batch container: predict() wraps the input NDArray list in it
# before calling mod.forward().
Batch = namedtuple('Batch', ['data'])

# Globals

# grids holds one (lat, lng) tuple per label, filled from grids.txt.
grids, ground_truth = [], {}

# S3 keys of the model files inside `bucket`.
f_params = 'geo/RN101-5k500-0012.params'
f_symbol = 'geo/RN101-5k500-symbol.json'

bucket = 'smallya-test'
s3 = boto3.resource('s3')
s3_client = boto3.client('s3')

# Load labels: every line of grids.txt is tab-separated, with the latitude in
# column 1 and the longitude in column 2 (column 0 is not used here).
with open('grids.txt', 'r') as f:
    for line in f:
        line = line.strip().split('\t')
        lat = float(line[1])
        lng = float(line[2])
        grids.append((lat, lng))

# Load model

def load_model(s_fname, p_fname):
    """
    Load a model checkpoint from file.

    :param s_fname: path of the symbol JSON file, passed to mx.symbol.load
    :param p_fname: path of the parameter file, passed to mx.nd.load
    :return: (symbol, arg_params, aux_params)
        symbol : the object returned by mx.symbol.load
        arg_params : dict of str to NDArray
            Model parameter, dict of name to NDArray of net's weights.
        aux_params : dict of str to NDArray
            Model parameter, dict of name to NDArray of net's auxiliary states.
    """
    symbol = mx.symbol.load(s_fname)
    save_dict = mx.nd.load(p_fname)
    arg_params = {}
    aux_params = {}
    # Saved keys look like "<type>:<name>"; only the "arg" and "aux" types are
    # kept, anything else is ignored.
    for k, v in save_dict.items():
        tp, name = k.split(':', 1)
        if tp == 'arg':
            arg_params[name] = v
        elif tp == 'aux':
            aux_params[name] = v
    return symbol, arg_params, aux_params

# NOTE: the labels are already read from grids.txt into `grids` right after
# the globals are defined. Reading the file a second time here appended every
# (lat, lng) pair twice, so the duplicate load is dropped.

mod = None

# Download the parameter and symbol files into temporary files and build the
# inference module from them. The context managers delete both temporary
# files afterwards, including when the download or the model load raises.
# No flush() is needed: urlretrieve writes to the path through its own file
# handle, so nothing is buffered in the NamedTemporaryFile objects.
with tempfile.NamedTemporaryFile(delete=True) as f_params_file, \
        tempfile.NamedTemporaryFile(delete=True) as f_symbol_file:
    # params
    urlretrieve("https://s3.amazonaws.com/smallya-test/geo/RN101-5k500-0012.params", f_params_file.name)

    # symbol
    urlretrieve("https://s3.amazonaws.com/smallya-test/geo/RN101-5k500-symbol.json", f_symbol_file.name)

    sym, arg_params, aux_params = load_model(f_symbol_file.name, f_params_file.name)
    mod = mx.mod.Module(symbol=sym, label_names=None)
    # Inference only: one 3x224x224 image per batch.
    mod.bind(for_training=False, data_shapes=[('data', (1, 3, 224, 224))],
             label_shapes=mod._label_shapes)
    mod.set_params(arg_params, aux_params, allow_missing=True)

# Helpers

def distance(p1, p2):
    """
    Return the great-circle (haversine) distance in km between two points.

    :param p1: (lat, lng) in degrees
    :param p2: (lat, lng) in degrees
    :return: distance in kilometres, using an Earth radius of 6371 km
    """
    # Imported locally: these math functions are not imported anywhere in the
    # visible module, so the original body could not run.
    from math import asin, cos, radians, sin, sqrt

    R = 6371  # Earth radius in km
    lat1, lng1, lat2, lng2 = map(radians, (p1[0], p1[1], p2[0], p2[1]))
    dlat = lat2 - lat1
    dlng = lng2 - lng1
    a = sin(dlat * 0.5) ** 2 + cos(lat1) * cos(lat2) * (sin(dlng * 0.5) ** 2)
    # Clamp: rounding can push sqrt(a) marginally above 1 for near-antipodal
    # points, which is outside asin's domain.
    return 2 * R * asin(min(1.0, sqrt(a)))

# Mean image for preprocessing: three per-channel values reshaped to
# (3, 1, 1) so they broadcast against the (1, 3, 224, 224) sample in predict().
mean_rgb = np.array([123.68, 116.779, 103.939]).reshape((3, 1, 1))

def predict(url, dataurl):
    '''
    Predict the location label for a given image.

    :param url: URL of the image to download; used when truthy
    :param dataurl: data URL ("<header>,<base64 payload>"); used when url is falsy
    :return: (lat, lng) tuple from `grids` for the top-ranked output
    :raises ValueError: if neither url nor dataurl is given
    '''
    print("downloading the image")
    if url:
        img_bytes = urllib2.urlopen(url).read()
    elif dataurl:
        # convert to image: the payload is the part after the first comma
        img_data = dataurl.split(",")[1]
        # base64 text must be a multiple of 4 characters long; restore any
        # padding that was stripped (adds nothing when already padded)
        img_data += "=" * (-len(img_data) % 4)
        img_bytes = base64.b64decode(img_data)
    else:
        raise ValueError("predict() needs either url or dataurl")

    # Decode from memory and force 3 channels for both input kinds, so that
    # grayscale/RGBA/palette images also yield an (H, W, 3) array below.
    img = Image.open(BytesIO(img_bytes)).convert('RGB')

    # preprocess by cropping to shorter side and then resize
    short_side = min(img.size)
    left = int((img.size[0] - short_side) / 2)
    right = left + short_side
    top = int((img.size[1] - short_side) / 2)
    bottom = top + short_side
    img = img.crop((left, top, right, bottom))
    img = img.resize((224, 224), Image.ANTIALIAS)

    # convert to numpy.ndarray, reorder (224, 224, 3) -> (3, 224, 224),
    # then add the leading batch axis
    sample = np.transpose(np.asarray(img), (2, 0, 1))[np.newaxis, :]
    print(sample.shape)

    # subtract the per-channel mean
    normed_img = (sample - mean_rgb).reshape((1, 3, 224, 224))

    mod.forward(Batch([mx.nd.array(normed_img)]), is_train=False)
    prob = mod.get_outputs()[0]
    # The earlier variant was:
    #   prob = prob.asnumpy()[0]
    #   pred = np.argsort(prob)[::-1]
    # .asnumpy() seems to fail on large arrays, so the sort is done on the
    # MXNet side and only the sorted indices are converted.
    pred = mx.ndarray.argsort(prob[0])
    pred = pred.asnumpy()[::-1]
    print("PRED %s" % (pred,))
    idx = int(pred[0])
    lat, lng = grids[idx]  # top result
    return lat, lng

def lambda_handler(event, context):
    """
    Lambda entry point: predict a location for the image named in `event`.

    Three event shapes are handled:
      * API Gateway GET  - image URL in event['queryStringParameters']['url']
      * API Gateway POST - JSON body with either a 'dataurl' or a 'url' key
      * direct invocation (no 'httpMethod' key) - image URL in event['url']

    :param event: the invocation payload (dict)
    :param context: Lambda context object; not used
    :return: dict with 'headers', 'body' (JSON string with 'address' and
        'latlng') and 'statusCode'
    """
    # example: url = 'http://www.japantimes.co.jp/wp-content/uploads/2016/03/n-tower-e-20160302.jpg'
    url = None
    data_url = None

    try:
        method = event['httpMethod']
        print("Request Method: %s" % (method,))
        # API Gateway GET method
        if method == 'GET':
            url = event['queryStringParameters']['url']
        # API Gateway POST method
        elif method == 'POST':
            data = json.loads(event['body'])
            if 'dataurl' in data:
                data_url = data['dataurl']
            else:
                url = data['url']
    except KeyError:
        # direct invocation
        url = event['url']

    print("URL: %s" % (url,))
    lat, lng = predict(url, data_url)
    latlng = "%s, %s" % (lat, lng)
    loc = geolocator.reverse(latlng)
    print("LOC: %s" % (loc,))

    out = {
        "headers": {
            "content-type": "application/json",
            "Access-Control-Allow-Origin": "*",
        },
        # json.dumps escapes quotes, backslashes and control characters, so
        # the body stays valid JSON whatever the address contains.
        "body": json.dumps({"address": loc[0], "latlng": "%s" % (loc[1],)}),
        "statusCode": 200,
    }
    return out

#4

How much memory do you have available for the lambda function?
what is the size of the ndarray you are attempting to call asnumpy on?


#5

total memory 1.5GB; successful calls tend to use 1100-1200MB, so memory isn’t an issue here.

<NDArray 15527 @cpu(0)> ; pretty sizable. But this issue has been reproduced on imagenet as well so 1000!


#6

Can you provide the steps to reproduce the issue? Does it only happen on Lambda? Which file do I run: lambda_code.py or lambda_function.py?
What error messages did you see specifically?


#7

lambda_function.py ; Please look at the ipynb file to deploy the lambda function easily with SAM.

You can try to run this on an EC2 instance running the Amazon Linux image used by Lambda: the public Amazon Linux AMI (AMI name: amzn-ami-hvm-2017.03.1.20170812-x86_64-gp2), which can be accessed here.


#8

Sure, I can take a look at the ipynb file. I wonder if there’s a way to reproduce it without lambda? Lambda makes it really hard to debug the code and dive deeper.


#9

do you have an update?


#10

While trying to replicate this on EC2 Linux with the package above, I get the following error. I get the same error without pdb as well.

-> mod.forward(Batch([mx.nd.array(normed_img)]), is_train=False)
(Pdb) n

/home/ec2-user/mx-lambda-test/ec_lh.py(156)predict()
-> prob = mod.get_outputs()[0]
(Pdb) Illegal instruction


#11

Hi Sunil,

I’ve seen Illegal instruction error when a buggy version of openblas is installed. What version of openblas did you install? I suspect this might be the cause.

I’m not able to reproduce the illegal instruction error on the AMI you suggested. Note that I installed the most recent version of openblas. Below please find the result when I ran it:

(note that I downloaded https://images-na.ssl-images-amazon.com/images/G/01/img15/pet-products/small-tiles/23695_pets_vertical_store_dogs_small_tile_8._CB312176604_.jpg and renamed it to test.jpg, let me know if you’re using a different input)

[ec2-user@ip-172-31-28-173 mxnet]$ LD_LIBRARY_PATH=/opt/OpenBLAS/lib/ python ../debug.py
[21:56:47] src/nnvm/legacy_json_util.cc:190: Loading symbol saved by previous version v0.9.4. Attempting to upgrade...
[21:56:47] src/nnvm/legacy_json_util.cc:198: Symbol successfully upgraded!
Request Method: URL: https://images-na.ssl-images-amazon.com/images/G/01/img15/pet-products/small-tiles/23695_pets_vertical_store_dogs_small_tile_8._CB312176604_.jpg
downloading the image
(1, 3, 224, 224)
0.12.0
> /home/ec2-user/debug.py(123)predict()
-> mod.forward(Batch([mx.nd.array(normed_img)]), is_train=False)
(Pdb) n
> /home/ec2-user/debug.py(124)predict()
-> print mod.get_outputs()
(Pdb) n
[
[[  1.06923224e-07   4.32113353e-07   1.49437668e-07 ...,   5.89478191e-07
    1.11700139e-07   5.48152670e-08]]
<NDArray 1x15527 @cpu(0)>]
> /home/ec2-user/debug.py(125)predict()
-> prob = mod.get_outputs()[0]
(Pdb) n
> /home/ec2-user/debug.py(130)predict()
-> pred = mx.ndarray.argsort(prob[0])
(Pdb) n
> /home/ec2-user/debug.py(131)predict()
-> pred = pred.asnumpy()[::-1]
(Pdb) n
> /home/ec2-user/debug.py(132)predict()
-> print "PRED", pred
(Pdb) n
PRED [  5460.   7800.  13555. ...,   4758.  12259.  12295.]
> /home/ec2-user/debug.py(133)predict()
-> idx = pred[0]
...

#12

The illegal instruction error is probably a red herring. I haven’t been able to reproduce the bug on EC2 either. But it fails consistently on Lambda, which uses the same AMI. Can you please try this on Lambda and see it for yourself?


#13

I’m trying your ipython notebook. I am quite new to lambda and have a dumb question:
I failed to execute the line:

!aws lambda update-function-code --function-name $func_name --zip-file fileb://$code_zip_name  --region us-west-2

An error occurred (ResourceNotFoundException) when calling the UpdateFunctionCode operation: Function not found: arn:aws:lambda:us-west-2:968277166688:function:MX-LAMBDA-GEOLOCATION-LambdaFunction-5MR66LKLX01L

and I am not 100% sure how to use the create-lambda command for my account. What value should I use for role, runtime and handler?

I tried

aws lambda create-function \
--region us-west-2 \
--function-name helloworld \
--zip-file fileb://mxnet_lambda_code.zip \
--role role:role-arn \
--runtime python2.7 \
> --handler lambda_handler


An error occurred (ValidationException) when calling the CreateFunction operation: 1 validation error detected: Value 'role-arn' at 'role' failed to satisfy constraint: Member must satisfy regular expression pattern: arn:(aws[a-zA-Z-]*)?:iam::\d{12}:role/?[a-zA-Z_0-9+=,.@\-_/]+